1 |
// ****************************************************************************** |
2 |
// * * |
3 |
// * This file is part of XviD, a free MPEG-4 video encoder/decoder * |
4 |
// * * |
5 |
// * * |
6 |
// * XviD is free software; you can redistribute it and/or modify it * |
7 |
// * under the terms of the GNU General Public License as published by * |
8 |
// * the Free Software Foundation; either version 2 of the License, or * |
9 |
// * (at your option) any later version. * |
10 |
// * * |
11 |
// * XviD is distributed in the hope that it will be useful, but * |
12 |
// * WITHOUT ANY WARRANTY; without even the implied warranty of * |
13 |
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
14 |
// * GNU General Public License for more details. * |
15 |
// * * |
16 |
// * You should have received a copy of the GNU General Public License * |
17 |
// * along with this program; if not, write to the Free Software * |
18 |
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * |
19 |
// * * |
20 |
// ****************************************************************************** |
21 |
// |
22 |
// ****************************************************************************** |
23 |
// * * |
24 |
// * fdct_ia64.s, IA-64 optimized forward DCT * |
25 |
// * * |
26 |
// * Completed version provided by Intel at AppNote AP-922 * |
27 |
// * http://developer.intel.com/software/products/college/ia32/strmsimd/ * |
28 |
// * Copyright (C) 1999 Intel Corporation, * |
29 |
// * * |
30 |
// * This version was implemented during an IA-64 practical training at * |
31 |
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) * |
32 |
// * Copyright (C) 2002 - Stephan Krause, Ingo-Marc Weber, Daniel Kallfass * |
33 |
// * * |
34 |
// * For more information visit the XviD homepage: http://www.xvid.org * |
35 |
// * * |
36 |
// ****************************************************************************** |
37 |
// |
38 |
// ****************************************************************************** |
39 |
// * * |
40 |
// * Revision history: * |
41 |
// * * |
42 |
// * 24.07.2002 Initial Version * |
43 |
// * * |
44 |
// ****************************************************************************** |
45 |
|
46 |
|
47 |
// This is a fast precise implementation of 8x8 Discrete Cosine Transform |
48 |
// published in Intel Application Note 922 from 1999 and optimized for IA-64. |
49 |
// |
50 |
// An unoptimized, commented-out "straightforward" version can be found at the end of this file. |
51 |
|
52 |
|
53 |
.pred.safe_across_calls p1-p5,p16-p63 |
54 |
.text |
55 |
.align 16 |
// ----------------------------------------------------------------------------
// fdct_ia64 -- forward 8x8 DCT, computed in place.
//
// In:   r32 = address of the block. The code reads sixteen 8-byte words at
//       r32+0, r32+8, ..., r32+120 and treats every word as four 16-bit lanes
//       (all arithmetic uses the "2"-suffixed parallel instructions).
// Out:  the same sixteen words are overwritten with the result. No return
//       register is written.
// Uses: r14 (saved ar.pfs), r23-r31 (constants), loc0-loc55.
//
// Register roles:
//   r25-r31       fixed-point constants (value * 65536), one copy per lane
//   r23, r24      rounding biases (4 and 2), one copy per lane
//   loc0 -loc15   addresses of the sixteen 8-byte words
//   loc16-loc23   first  8-byte word of each 16-byte row ("x0".."x7")
//   loc24-loc31   second 8-byte word of each 16-byte row ("x0.2".."x7.2")
//   loc32-loc47   temporaries (t0..t7, buf0..buf7); loc40-loc47 also hold the
//                 second-half results y0.2..y7.2
//   loc48-loc55   first-half results y0..y7
//
// Pipeline: load -> << 3 -> column pass (two halves) -> +2, >> 2 -> transpose
//           -> row pass (two halves) -> transpose -> +4, >> 3 -> store.
//
// Signed-constant note: pmpyshr2 (no .u suffix) multiplies signed 16-bit
// lanes. The constants with bit 15 set (c2, g1, g2, g3, g4) are negative when
// read that way, so "x * c >> 16" really yields x * (c - 1); every such
// multiply is followed by a padd2 that adds x back. c0 and c1 are below
// 0x8000 and are used without that correction.
//
// The instruction order is hand-scheduled: work belonging to neighbouring
// steps is interleaved, and the ";;" stops delimit the instruction groups.
// ----------------------------------------------------------------------------
56 |
.global fdct_ia64# |
57 |
.proc fdct_ia64# |
58 |
fdct_ia64: |
59 |
// NOTE(review): no .save / .body directives follow .prologue -- confirm the
// generated unwind information is what is wanted.
.prologue |
60 |
// 1 input (r32), 56 locals (loc0-loc55), 0 outputs, 0 rotating; the previous
// ar.pfs is kept in r14 and written back just before the return.
alloc r14 = ar.pfs, 1, 56, 0, 0 |
61 |
// Load the constants (16-bit fixed point, value * 65536) |
62 |
mov r31 = 0x32ec // c0 = tan(1pi/16) |
63 |
mov r30 = 0x6a0a // c1 = tan(2pi/16) |
64 |
mov r29 = 0xab0e // c2 = tan(3pi/16) |
65 |
mov r28 = 0xb505 // g4 = cos(4pi/16) |
66 |
mov r27 = 0xd4db // g3 = cos(3pi/16) |
67 |
mov r26 = 0xec83 // g2 = cos(2pi/16) |
68 |
mov r25 = 0xfb15 // g1 = cos(1pi/16) |
69 |
mov r24 = 0x0002 // rounding bias added before the >> 2 descale |
70 |
mov r23 = 0x0004 // rounding bias added before the final >> 3 descale |
71 |
|
72 |
// Load Matrix into registers |
73 |
|
74 |
// Even-numbered locN (loc0..loc14): address of the first 8 bytes of each
// 16-byte row; odd-numbered locN (loc1..loc15): address of the second 8 bytes.
add loc0 = r0, r32 |
75 |
add loc2 = 16, r32 |
76 |
add loc4 = 32, r32 |
77 |
add loc6 = 48, r32 |
78 |
add loc8 = 64, r32 |
79 |
add loc10 = 80, r32 |
80 |
add loc12 = 96, r32 |
81 |
add loc14 = 112, r32 |
82 |
add loc1 = 8, r32 |
83 |
add loc3 = 24, r32 |
84 |
add loc5 = 40, r32 |
85 |
add loc7 = 56, r32 |
86 |
add loc9 = 72, r32 |
87 |
add loc11 = 88, r32 |
88 |
add loc13 = 104, r32 |
89 |
add loc15 = 120, r32 |
90 |
;; |
91 |
// loc16..loc23 <- first words of rows 0..7, loc24..loc31 <- second words.
// The mux2 ..., 0x00 instructions mixed in below copy the low 16 bits of each
// constant into all four lanes of its register.
ld8 loc16 = [loc0] |
92 |
ld8 loc17 = [loc2] |
93 |
ld8 loc18 = [loc4] |
94 |
ld8 loc19 = [loc6] |
95 |
ld8 loc20 = [loc8] |
96 |
ld8 loc21 = [loc10] |
97 |
ld8 loc22 = [loc12] |
98 |
ld8 loc23 = [loc14] |
99 |
ld8 loc24 = [loc1] |
100 |
ld8 loc25 = [loc3] |
101 |
ld8 loc26 = [loc5] |
102 |
ld8 loc27 = [loc7] |
103 |
mux2 r26 = r26, 0x00 |
104 |
ld8 loc28 = [loc9] |
105 |
mux2 r31 = r31, 0x00 |
106 |
mux2 r25 = r25, 0x00 |
107 |
ld8 loc29 = [loc11] |
108 |
mux2 r30 = r30, 0x00 |
109 |
mux2 r29 = r29, 0x00 |
110 |
ld8 loc30 = [loc13] |
111 |
mux2 r28 = r28, 0x00 |
112 |
mux2 r27 = r27, 0x00 |
113 |
ld8 loc31 = [loc15] |
114 |
mux2 r24 = r24, 0x00 |
115 |
mux2 r23 = r23, 0x00 |
116 |
;; |
117 |
// Scale every element by 8 (<< 3). The shifts of loc24..loc31 are interleaved
// with the start of the first column pass below.
pshl2 loc16 = loc16, 3 |
118 |
pshl2 loc17 = loc17, 3 |
119 |
pshl2 loc18 = loc18, 3 |
120 |
pshl2 loc19 = loc19, 3 |
121 |
pshl2 loc20 = loc20, 3 |
122 |
pshl2 loc21 = loc21, 3 |
123 |
pshl2 loc22 = loc22, 3 |
124 |
pshl2 loc23 = loc23, 3 |
125 |
;; |
126 |
pshl2 loc24 = loc24, 3 |
127 |
|
128 |
// ******************* |
129 |
// column-DCT 1st half |
130 |
// ******************* |
131 |
|
132 |
// Inputs x0..x7 = loc16..loc23, results y0..y7 = loc48..loc55.
psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
133 |
pshl2 loc25 = loc25, 3 |
134 |
pshl2 loc26 = loc26, 3 |
135 |
psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
136 |
pshl2 loc27 = loc27, 3 |
137 |
pshl2 loc28 = loc28, 3 |
138 |
;; |
139 |
padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
140 |
pshl2 loc29 = loc29, 3 |
141 |
pshl2 loc30 = loc30, 3 |
142 |
padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
143 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
144 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
145 |
;; |
146 |
padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
147 |
pshl2 loc31 = loc31, 3 |
148 |
padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
149 |
psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
150 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
151 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
152 |
;; |
153 |
psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
154 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
155 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
156 |
|
157 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
158 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
159 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
160 |
;; |
161 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
162 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
163 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
164 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
165 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
166 |
;; |
167 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
168 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
169 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
170 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
171 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
172 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
173 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
174 |
;; |
175 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
176 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
177 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
178 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
179 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
180 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
181 |
;; |
182 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
183 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
184 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
185 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
186 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
187 |
;; |
188 |
padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
189 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
190 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
191 |
padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
192 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
193 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
194 |
;; |
195 |
padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
196 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
197 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
198 |
padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
199 |
padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
200 |
padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
201 |
;; |
202 |
padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
203 |
padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
204 |
|
205 |
// divide by 4: add the rounding bias (r24 = 2 per lane), then shift right by 2 |
206 |
|
207 |
padd2 loc48 = loc48, r24 |
208 |
padd2 loc49 = loc49, r24 |
209 |
padd2 loc50 = loc50, r24 |
210 |
padd2 loc52 = loc52, r24 |
211 |
;; |
212 |
padd2 loc51 = loc51, r24 |
213 |
pshr2 loc48 = loc48, 2 |
214 |
padd2 loc53 = loc53, r24 |
215 |
pshr2 loc49 = loc49, 2 |
216 |
padd2 loc54 = loc54, r24 |
217 |
pshr2 loc50 = loc50, 2 |
218 |
padd2 loc55 = loc55, r24 |
219 |
pshr2 loc52 = loc52, 2 |
220 |
;; |
221 |
pshr2 loc51 = loc51, 2 |
222 |
pshr2 loc53 = loc53, 2 |
223 |
pshr2 loc54 = loc54, 2 |
224 |
pshr2 loc55 = loc55, 2 |
225 |
|
226 |
|
227 |
// ******************* |
228 |
// column-DCT 2nd half |
229 |
// ******************* |
230 |
|
231 |
// Same computation on x0.2..x7.2 = loc24..loc31; results y0.2..y7.2 go to
// loc40..loc47 and are descaled later, interleaved with the transpose.
psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
232 |
psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
233 |
padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
234 |
padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
235 |
;; |
236 |
padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
237 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
238 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
239 |
padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
240 |
;; |
241 |
psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
242 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
243 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
244 |
;; |
245 |
psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
246 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
247 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
248 |
|
249 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
250 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
251 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
252 |
;; |
253 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
254 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
255 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
256 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
257 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
258 |
;; |
259 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
260 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
261 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
262 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
263 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
264 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
265 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
266 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
267 |
;; |
268 |
padd2 loc34 = loc18, loc43 // t2 = x2 + buf3 |
269 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
270 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
271 |
psub2 loc35 = loc42, loc19 // t3 = buf2 - x3 |
272 |
padd2 loc36 = loc20, loc45 // t4 = x4 + buf5 |
273 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
274 |
;; |
275 |
psub2 loc37 = loc44, loc21 // t5 = buf4 - x5 |
276 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
277 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
278 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
279 |
;; |
280 |
padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
281 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
282 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
283 |
padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
284 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
285 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
286 |
;; |
287 |
padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
288 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
289 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
290 |
padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
291 |
padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
292 |
padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
293 |
;; |
294 |
padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
295 |
|
296 |
// ******************* |
297 |
// transpose matrix |
298 |
// ******************* |
299 |
|
300 |
// mix2 pairs 16-bit lanes of two registers, mix4 then pairs 32-bit halves;
// together they regroup y0..y7 / y0.2..y7.2 into the inputs of the row pass
// (loc16..loc23 and loc24..loc31). The last y5.2 add and the >> 2 descale of
// the second-half results are interleaved with the shuffles.
mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 |
301 |
mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 |
302 |
padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
303 |
mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 |
304 |
mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 |
305 |
;; |
306 |
|
307 |
// divide by 4: add the rounding bias (r24 = 2 per lane), then shift right by 2 |
308 |
|
309 |
padd2 loc40 = loc40, r24 |
310 |
padd2 loc41 = loc41, r24 |
311 |
mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 |
312 |
padd2 loc42 = loc42, r24 |
313 |
padd2 loc43 = loc43, r24 |
314 |
mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 |
315 |
padd2 loc44 = loc44, r24 |
316 |
padd2 loc45 = loc45, r24 |
317 |
mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 |
318 |
padd2 loc46 = loc46, r24 |
319 |
padd2 loc47 = loc47, r24 |
320 |
mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 |
321 |
;; |
322 |
pshr2 loc40 = loc40, 2 |
323 |
pshr2 loc41 = loc41, 2 |
324 |
pshr2 loc42 = loc42, 2 |
325 |
pshr2 loc43 = loc43, 2 |
326 |
mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 |
327 |
mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 |
328 |
mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 |
329 |
mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 |
330 |
;; |
331 |
pshr2 loc44 = loc44, 2 |
332 |
pshr2 loc45 = loc45, 2 |
333 |
pshr2 loc46 = loc46, 2 |
334 |
pshr2 loc47 = loc47, 2 |
335 |
mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 |
336 |
mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 |
337 |
mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 |
338 |
mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 |
339 |
;; |
340 |
mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 |
341 |
mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 |
342 |
mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 |
343 |
mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 |
344 |
;; |
345 |
mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 |
346 |
mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 |
347 |
mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 |
348 |
mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 |
349 |
;; |
350 |
mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 |
351 |
mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 |
352 |
mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 |
353 |
mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 |
354 |
;; |
355 |
mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 |
356 |
mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 |
357 |
mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 |
358 |
mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 |
359 |
|
360 |
// ******************* |
361 |
// row-DCT 1st half |
362 |
// ******************* |
363 |
|
364 |
// Same butterfly as the column pass, applied to the transposed data in
// loc16..loc23; results y0..y7 = loc48..loc55. No >> 2 follows this pass.
psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
365 |
psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
366 |
;; |
367 |
padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
368 |
padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
369 |
padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
370 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
371 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
372 |
padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
373 |
;; |
374 |
psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
375 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
376 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
377 |
;; |
378 |
psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
379 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
380 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
381 |
|
382 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
383 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
384 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
385 |
;; |
386 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
387 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
388 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
389 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
390 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
391 |
;; |
392 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
393 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
394 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
395 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
396 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
397 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
398 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
399 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
400 |
;; |
401 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
402 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
403 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
404 |
;; |
405 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
406 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
407 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
408 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
409 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
410 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
411 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
412 |
;; |
413 |
padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
414 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
415 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
416 |
padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
417 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
418 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
419 |
;; |
420 |
padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
421 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
422 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
423 |
padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
424 |
padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
425 |
padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
426 |
;; |
427 |
padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
428 |
padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
429 |
|
430 |
// ******************* |
431 |
// row-DCT 2nd half |
432 |
// ******************* |
433 |
|
434 |
// Inputs x0.2..x7.2 = loc24..loc31; results y0.2..y7.2 go to loc40..loc47.
// The adds producing y1.2, y3.2, y5.2 and y7.2 are deferred into the
// transpose section that follows.
psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
435 |
psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
436 |
padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
437 |
padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
438 |
;; |
439 |
padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
440 |
psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
441 |
padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
442 |
padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
443 |
;; |
444 |
psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
445 |
pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
446 |
pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
447 |
;; |
448 |
psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
449 |
padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
450 |
padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
451 |
|
452 |
padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
453 |
padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
454 |
psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
455 |
;; |
456 |
psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
457 |
padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
458 |
padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
459 |
psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
460 |
psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
461 |
;; |
462 |
pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
463 |
padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
464 |
pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
465 |
pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
466 |
psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
467 |
pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
468 |
pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
469 |
pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
470 |
;; |
471 |
padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
472 |
padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
473 |
padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
474 |
;; |
475 |
psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
476 |
padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
477 |
psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
478 |
padd2 loc38 = loc22, loc47 // t6 = x6 + buf7 |
479 |
psub2 loc39 = loc46, loc23 // t7 = buf6 - x7 |
480 |
pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
481 |
pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
482 |
;; |
483 |
padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
484 |
pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
485 |
pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
486 |
padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
487 |
pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
488 |
pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
489 |
;; |
490 |
padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
491 |
pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
492 |
pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
493 |
padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
494 |
// NOTE(review): the two explicit nops below are presumably bundle padding -- confirm.
nop.i 0x0 |
495 |
nop.i 0x0 |
496 |
;; |
497 |
|
498 |
// ******************* |
499 |
// Transpose matrix |
500 |
// ******************* |
501 |
// Second transpose. Compared with the first one, the mix operands are swapped
// and the .l/.r forms exchanged. After it, loc16..loc23 are the words stored
// to loc0, loc2, ..., loc14 and loc24..loc31 the words stored to loc1, loc3,
// ..., loc15 (see "Store matrix"). The remaining y*.2 adds of the row pass
// and the rounding bias for the final >> 3 (r23 = 4 per lane) are interleaved.
padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
502 |
mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0 |
503 |
mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0 |
504 |
padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
505 |
mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2 |
506 |
mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2 |
507 |
;; |
508 |
padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
509 |
mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0 |
510 |
mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1 |
511 |
padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
512 |
mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0 |
513 |
mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1 |
514 |
;; |
515 |
padd2 loc16 = loc16, r23 |
516 |
mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2 |
517 |
mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2 |
518 |
padd2 loc17 = loc17, r23 |
519 |
mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2 |
520 |
mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2 |
521 |
;; |
522 |
padd2 loc18 = loc18, r23 |
523 |
mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0 |
524 |
mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1 |
525 |
padd2 loc19 = loc19, r23 |
526 |
mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0 |
527 |
mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1 |
528 |
;; |
529 |
padd2 loc20 = loc20, r23 |
530 |
mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4 |
531 |
mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4 |
532 |
padd2 loc21 = loc21, r23 |
533 |
mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6 |
534 |
mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6 |
535 |
;; |
536 |
padd2 loc22 = loc22, r23 |
537 |
mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0 |
538 |
mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1 |
539 |
padd2 loc23 = loc23, r23 |
540 |
mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0 |
541 |
mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1 |
542 |
;; |
543 |
padd2 loc24 = loc24, r23 |
544 |
mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2 |
545 |
mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2 |
546 |
padd2 loc25 = loc25, r23 |
547 |
mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2 |
548 |
mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2 |
549 |
;; |
550 |
padd2 loc26 = loc26, r23 |
551 |
mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0 |
552 |
mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1 |
553 |
padd2 loc27 = loc27, r23 |
554 |
mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0 |
555 |
mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1 |
556 |
;; |
557 |
// ******************* |
558 |
// Descale |
559 |
// ******************* |
560 |
// Final descale: shift every word right by 3. loc16..loc27 already carry the
// rounding bias; loc28..loc31 receive it here, one group before their shift.
padd2 loc28 = loc28, r23 |
561 |
pshr2 loc16 = loc16, 3 |
562 |
pshr2 loc17 = loc17, 3 |
563 |
padd2 loc29 = loc29, r23 |
564 |
pshr2 loc18 = loc18, 3 |
565 |
pshr2 loc19 = loc19, 3 |
566 |
padd2 loc30 = loc30, r23 |
567 |
pshr2 loc20 = loc20, 3 |
568 |
pshr2 loc21 = loc21, 3 |
569 |
padd2 loc31 = loc31, r23 |
570 |
pshr2 loc22 = loc22, 3 |
571 |
pshr2 loc23 = loc23, 3 |
572 |
;; |
573 |
pshr2 loc24 = loc24, 3 |
574 |
pshr2 loc25 = loc25, 3 |
575 |
pshr2 loc26 = loc26, 3 |
576 |
pshr2 loc27 = loc27, 3 |
577 |
pshr2 loc28 = loc28, 3 |
578 |
pshr2 loc29 = loc29, 3 |
579 |
pshr2 loc30 = loc30, 3 |
580 |
pshr2 loc31 = loc31, 3 |
581 |
;; |
582 |
// ******************* |
583 |
// Store matrix |
584 |
// ******************* |
585 |
// Write the sixteen words back to the addresses they were loaded from, in
// ascending address order (loc0 = +0, loc1 = +8, loc2 = +16, ...).
st8 [loc0] = loc16 |
586 |
st8 [loc1] = loc24 |
587 |
st8 [loc2] = loc17 |
588 |
st8 [loc3] = loc25 |
589 |
st8 [loc4] = loc18 |
590 |
st8 [loc5] = loc26 |
591 |
st8 [loc6] = loc19 |
592 |
st8 [loc7] = loc27 |
593 |
st8 [loc8] = loc20 |
594 |
st8 [loc9] = loc28 |
595 |
st8 [loc10] = loc21 |
596 |
st8 [loc11] = loc29 |
597 |
st8 [loc12] = loc22 |
598 |
st8 [loc13] = loc30 |
599 |
st8 [loc14] = loc23 |
600 |
st8 [loc15] = loc31 |
601 |
|
602 |
// Write back the ar.pfs value that alloc placed in r14, then return.
mov ar.pfs = r14 |
603 |
br.ret.sptk.many b0 |
604 |
.endp fdct_ia64# |
605 |
// Common symbol "fdct" (size 8, alignment 8); nothing in the code above refers
// to it. NOTE(review): purpose not visible in this file -- confirm it is needed.
.common fdct#,8,8 |
606 |
|
607 |
|
608 |
|
609 |
|
610 |
|
611 |
|
612 |
|
613 |
|
614 |
//*********************************************** |
615 |
//* Here is a version of the DCT implementation * |
616 |
//* unoptimized in terms of command ordering. * |
617 |
//* This version is about 30% slower but * |
618 |
//* easier to understand. * |
619 |
//*********************************************** |
620 |
// |
621 |
// .pred.safe_across_calls p1-p5,p16-p63 |
622 |
//.text |
623 |
// .align 16 |
624 |
// .global fdct_ia64# |
625 |
// .proc fdct_ia64# |
626 |
//fdct_ia64: |
627 |
// .prologue |
628 |
// alloc r14 = ar.pfs, 1, 56, 0, 0 |
629 |
// |
630 |
// // ******************* |
631 |
// // Save constants |
632 |
// // ******************* |
633 |
// mov r31 = 0x32ec // c0 = tan(1pi/16) |
634 |
// mov r30 = 0x6a0a // c1 = tan(2pi/16) |
635 |
// mov r29 = 0xab0e // c2 = tan(3pi/16) |
636 |
// mov r28 = 0xb505 // g4 = cos(4pi/16) |
637 |
// mov r27 = 0xd4db // g3 = cos(3pi/16) |
638 |
// mov r26 = 0xec83 // g2 = cos(2pi/16) |
639 |
// mov r25 = 0xfb15 // g1 = cos(1pi/16) |
640 |
// mov r24 = 0x0002 // correction bit for descaling |
641 |
// mov r23 = 0x0004 // correction bit for descaling |
642 |
// |
643 |
// // ************************** |
644 |
// // Load Matrix into registers |
645 |
// // ************************** |
646 |
// |
647 |
// add loc0 = r0, r32 |
648 |
// ;; |
649 |
// mux2 r31 = r31, 0x00 |
650 |
// mux2 r30 = r30, 0x00 |
651 |
// mux2 r29 = r29, 0x00 |
652 |
// mux2 r28 = r28, 0x00 |
653 |
// mux2 r27 = r27, 0x00 |
654 |
// mux2 r26 = r26, 0x00 |
655 |
// mux2 r25 = r25, 0x00 |
656 |
// mux2 r24 = r24, 0x00 |
657 |
// mux2 r23 = r23, 0x00 |
658 |
// ld8 loc16 = [loc0] |
659 |
// add loc2 = 16, r32 |
660 |
// add loc4 = 32, r32 |
661 |
// add loc6 = 48, r32 |
662 |
// add loc8 = 64, r32 |
663 |
// add loc10 = 80, r32 |
664 |
// ;; |
665 |
// ld8 loc17 = [loc2] |
666 |
// ld8 loc18 = [loc4] |
667 |
// add loc12 = 96, r32 |
668 |
// ld8 loc19 = [loc6] |
669 |
// ld8 loc20 = [loc8] |
670 |
// add loc14 = 112, r32 |
671 |
// ;; |
672 |
// ld8 loc21 = [loc10] |
673 |
// ld8 loc22 = [loc12] |
674 |
// add loc1 = 8, r32 |
675 |
// ld8 loc23 = [loc14] |
676 |
// add loc3 = 24, r32 |
677 |
// add loc5 = 40, r32 |
678 |
// ;; |
679 |
// ld8 loc24 = [loc1] |
680 |
// ld8 loc25 = [loc3] |
681 |
// add loc7 = 56, r32 |
682 |
// ld8 loc26 = [loc5] |
683 |
// add loc9 = 72, r32 |
684 |
// add loc11 = 88, r32 |
685 |
// ;; |
686 |
// ld8 loc27 = [loc7] |
687 |
// ld8 loc28 = [loc9] |
688 |
// add loc13 = 104, r32 |
689 |
// ld8 loc29 = [loc11] |
690 |
// add loc15 = 120, r32 |
691 |
// ;; |
692 |
// ld8 loc30 = [loc13] |
693 |
// ld8 loc31 = [loc15] |
694 |
// ;; |
695 |
// // ****** |
696 |
// // Scale |
697 |
// // ****** |
698 |
// pshl2 loc16 = loc16, 3 |
699 |
// pshl2 loc17 = loc17, 3 |
700 |
// pshl2 loc18 = loc18, 3 |
701 |
// pshl2 loc19 = loc19, 3 |
702 |
// pshl2 loc20 = loc20, 3 |
703 |
// pshl2 loc21 = loc21, 3 |
704 |
// pshl2 loc22 = loc22, 3 |
705 |
// pshl2 loc23 = loc23, 3 |
706 |
// pshl2 loc24 = loc24, 3 |
707 |
// pshl2 loc25 = loc25, 3 |
708 |
// pshl2 loc26 = loc26, 3 |
709 |
// pshl2 loc27 = loc27, 3 |
710 |
// pshl2 loc28 = loc28, 3 |
711 |
// pshl2 loc29 = loc29, 3 |
712 |
// pshl2 loc30 = loc30, 3 |
713 |
// pshl2 loc31 = loc31, 3 |
714 |
// ;; |
715 |
// |
716 |
// // ******************* |
717 |
// // column-DCT 1st half |
718 |
// // ******************* |
719 |
// |
720 |
// padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
721 |
// padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
722 |
// padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
723 |
// padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
724 |
// psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
725 |
// psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
726 |
// psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
727 |
// psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
728 |
// ;; |
729 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
730 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
731 |
// ;; |
732 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
733 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
734 |
// ;; |
735 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf1 |
736 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf2 |
737 |
// ;; |
738 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
739 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
740 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
741 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
742 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
743 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
744 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
745 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
746 |
// ;; |
747 |
// |
748 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
749 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
750 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
751 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
752 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
753 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
754 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
755 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
756 |
// ;; |
757 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
758 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
759 |
// ;; |
760 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
761 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
762 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c1) |
763 |
// psub2 loc37 = loc44, loc21 // t5 = (c1 * x4) - x5 |
764 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c1) |
765 |
// psub2 loc39 = loc46, loc23 // t7 = (c1 * x6) - x7 |
766 |
// ;; |
767 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
768 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
769 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
770 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
771 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
772 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
773 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
774 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
775 |
// ;; |
776 |
// padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
777 |
// padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
778 |
// padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
779 |
// padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
780 |
// padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
781 |
// padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
782 |
// padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
783 |
// padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
784 |
// ;; |
785 |
// |
786 |
// // ******************* |
787 |
// // column-DCT 2nd half |
788 |
// // ******************* |
789 |
// |
790 |
// padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
791 |
// padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
792 |
// padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
793 |
// padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
794 |
// psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
795 |
// psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
796 |
// psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
797 |
// psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
798 |
// ;; |
799 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
800 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
801 |
// ;; |
802 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
803 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
804 |
// ;; |
805 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
805 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
807 |
// ;; |
808 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
809 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
810 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
811 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
812 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
813 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
814 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
815 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
816 |
// ;; |
817 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
818 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
819 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
820 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
821 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
822 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
823 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
824 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
825 |
// ;; |
826 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
827 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
828 |
// ;; |
829 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
830 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
831 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
831 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
832 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7) |
833 |
// psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7 |
835 |
// ;; |
836 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
837 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
838 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
839 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
840 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
841 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
842 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
843 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
844 |
// ;; |
845 |
// padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
846 |
// padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
847 |
// padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
848 |
// padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
849 |
// padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
850 |
// padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
851 |
// padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
852 |
// padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
853 |
// ;; |
854 |
// padd2 loc40 = loc40, r24 // add r24 to correct rounding |
855 |
// padd2 loc41 = loc41, r24 |
856 |
// padd2 loc42 = loc42, r24 |
857 |
// padd2 loc43 = loc43, r24 |
858 |
// padd2 loc44 = loc44, r24 |
859 |
// padd2 loc45 = loc45, r24 |
860 |
// padd2 loc46 = loc46, r24 |
861 |
// padd2 loc47 = loc47, r24 |
862 |
// padd2 loc48 = loc48, r24 |
863 |
// padd2 loc49 = loc49, r24 |
864 |
// padd2 loc50 = loc50, r24 |
865 |
// padd2 loc51 = loc51, r24 |
866 |
// padd2 loc52 = loc52, r24 |
867 |
// padd2 loc53 = loc53, r24 |
868 |
// padd2 loc54 = loc54, r24 |
869 |
// padd2 loc55 = loc55, r24 |
870 |
// ;; |
871 |
// pshr2 loc40 = loc40, 2 // Divide all matrix elements by 4 |
872 |
// pshr2 loc41 = loc41, 2 |
873 |
// pshr2 loc42 = loc42, 2 |
874 |
// pshr2 loc43 = loc43, 2 |
875 |
// pshr2 loc44 = loc44, 2 |
876 |
// pshr2 loc45 = loc45, 2 |
877 |
// pshr2 loc46 = loc46, 2 |
878 |
// pshr2 loc47 = loc47, 2 |
879 |
// pshr2 loc48 = loc48, 2 |
880 |
// pshr2 loc49 = loc49, 2 |
881 |
// pshr2 loc50 = loc50, 2 |
882 |
// pshr2 loc51 = loc51, 2 |
883 |
// pshr2 loc52 = loc52, 2 |
884 |
// pshr2 loc53 = loc53, 2 |
885 |
// pshr2 loc54 = loc54, 2 |
886 |
// pshr2 loc55 = loc55, 2 |
887 |
// ;; |
888 |
// |
889 |
// // ***************** |
890 |
// // Transpose matrix |
891 |
// // ***************** |
892 |
// |
893 |
// mix2.r loc32 = loc48, loc49 // tmp0 = mixr y0, y1 |
894 |
// mix2.l loc33 = loc48, loc49 // tmp1 = mixl y0, y1 |
895 |
// mix2.r loc34 = loc50, loc51 // tmp2 = mixr y2, y3 |
896 |
// mix2.l loc35 = loc50, loc51 // tmp3 = mixl y2, y3 |
897 |
// ;; |
898 |
// mix4.r loc16 = loc32, loc34 // x0 = mixr tmp0, tmp2 |
899 |
// mix4.r loc17 = loc33, loc35 // x1 = mixr tmp1, tmp3 |
900 |
// mix4.l loc18 = loc32, loc34 // x2 = mixl tmp0, tmp2 |
901 |
// mix4.l loc19 = loc33, loc35 // x3 = mixl tmp1, tmp3 |
902 |
// ;; |
903 |
// mix2.r loc32 = loc40, loc41 // tmp0 = mixr y0.2, y1.2 |
904 |
// mix2.l loc33 = loc40, loc41 // tmp1 = mixl y0.2, y1.2 |
905 |
// mix2.r loc34 = loc42, loc43 // tmp2 = mixr y2.2, y3.2 |
906 |
// mix2.l loc35 = loc42, loc43 // tmp3 = mixl y2.2, y3.2 |
907 |
// ;; |
908 |
// mix4.r loc20 = loc32, loc34 // x4 = mixr tmp0, tmp2 |
909 |
// mix4.r loc21 = loc33, loc35 // x5 = mixr tmp1, tmp3 |
910 |
// mix4.l loc22 = loc32, loc34 // x6 = mixl tmp0, tmp2 |
911 |
// mix4.l loc23 = loc33, loc35 // x7 = mixl tmp1, tmp3 |
912 |
// ;; |
913 |
// mix2.r loc32 = loc52, loc53 // tmp0 = mixr y4, y5 |
914 |
// mix2.l loc33 = loc52, loc53 // tmp1 = mixl y4, y5 |
915 |
// mix2.r loc34 = loc54, loc55 // tmp2 = mixr y6, y7 |
916 |
// mix2.l loc35 = loc54, loc55 // tmp3 = mixl y6, y7 |
917 |
// ;; |
918 |
// mix4.r loc24 = loc32, loc34 // x0.2 = mixr tmp0, tmp2 |
919 |
// mix4.r loc25 = loc33, loc35 // x1.2 = mixr tmp1, tmp3 |
920 |
// mix4.l loc26 = loc32, loc34 // x2.2 = mixl tmp0, tmp2 |
921 |
// mix4.l loc27 = loc33, loc35 // x3.2 = mixl tmp1, tmp3 |
922 |
// ;; |
923 |
// mix2.r loc32 = loc44, loc45 // tmp0 = mixr y4.2, y5.2 |
924 |
// mix2.l loc33 = loc44, loc45 // tmp1 = mixl y4.2, y5.2 |
925 |
// mix2.r loc34 = loc46, loc47 // tmp2 = mixr y6.2, y7.2 |
926 |
// mix2.l loc35 = loc46, loc47 // tmp3 = mixl y6.2, y7.2 |
927 |
// ;; |
928 |
// mix4.r loc28 = loc32, loc34 // x4.2 = mixr tmp0, tmp2 |
929 |
// mix4.r loc29 = loc33, loc35 // x5.2 = mixr tmp1, tmp3 |
930 |
// mix4.l loc30 = loc32, loc34 // x6.2 = mixl tmp0, tmp2 |
931 |
// mix4.l loc31 = loc33, loc35 // x7.2 = mixl tmp1, tmp3 |
932 |
// ;; |
933 |
// |
934 |
// // ******************* |
935 |
// // row-DCT 1st half |
936 |
// // ******************* |
937 |
// |
938 |
// padd2 loc32 = loc16, loc23 // t0 = x0 + x7 |
939 |
// padd2 loc33 = loc17, loc22 // t1 = x1 + x6 |
940 |
// padd2 loc34 = loc18, loc21 // t2 = x2 + x5 |
941 |
// padd2 loc35 = loc19, loc20 // t3 = x3 + x4 |
942 |
// psub2 loc36 = loc16, loc23 // t4 = x0 - x7 |
943 |
// psub2 loc37 = loc17, loc22 // t5 = x1 - x6 |
944 |
// psub2 loc38 = loc18, loc21 // t6 = x2 - x5 |
945 |
// psub2 loc39 = loc19, loc20 // t7 = x3 - x4 |
946 |
// ;; |
947 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
948 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
949 |
// ;; |
950 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
951 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
952 |
// ;; |
953 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
954 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
955 |
// ;; |
956 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
957 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
958 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
959 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
960 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
961 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
962 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
963 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
964 |
// ;; |
965 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
966 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
967 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
968 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
969 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
970 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
971 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
972 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
973 |
// ;; |
974 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
975 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
976 |
// ;; |
977 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
978 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
979 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
980 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
981 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7) |
982 |
// psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7 |
983 |
// ;; |
984 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
985 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
986 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
987 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
988 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
989 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
990 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
991 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
992 |
// ;; |
993 |
// padd2 loc48 = loc16, loc32 // y0 = x0 + t0 |
994 |
// padd2 loc49 = loc20, loc36 // y1 = x4 + t4 |
995 |
// padd2 loc50 = loc18, loc34 // y2 = x2 + t2 |
996 |
// padd2 loc51 = loc22, loc38 // y3 = x6 + t6 |
997 |
// padd2 loc52 = loc17, loc33 // y4 = x1 + t1 |
998 |
// padd2 loc53 = loc23, loc39 // y5 = x7 + t7 |
999 |
// padd2 loc54 = loc19, loc35 // y6 = x3 + t3 |
1000 |
// padd2 loc55 = loc21, loc37 // y7 = x5 + t5 |
1001 |
// ;; |
1002 |
// |
1003 |
// // ******************* |
1004 |
// // row-DCT 2nd half |
1005 |
// // ******************* |
1006 |
// |
1007 |
// padd2 loc32 = loc24, loc31 // t0 = x0.2 + x7.2 |
1008 |
// padd2 loc33 = loc25, loc30 // t1 = x1.2 + x6.2 |
1009 |
// padd2 loc34 = loc26, loc29 // t2 = x2.2 + x5.2 |
1010 |
// padd2 loc35 = loc27, loc28 // t3 = x3.2 + x4.2 |
1011 |
// psub2 loc36 = loc24, loc31 // t4 = x0.2 - x7.2 |
1012 |
// psub2 loc37 = loc25, loc30 // t5 = x1.2 - x6.2 |
1013 |
// psub2 loc38 = loc26, loc29 // t6 = x2.2 - x5.2 |
1014 |
// psub2 loc39 = loc27, loc28 // t7 = x3.2 - x4.2 |
1015 |
// ;; |
1016 |
// padd2 loc40 = loc37, loc38 // buf0 = t5 + t6 |
1017 |
// psub2 loc41 = loc37, loc38 // buf1 = t5 - t6 |
1018 |
// ;; |
1019 |
// pmpyshr2 loc37 = loc40, r28, 16 // t5 = buf0 * g4 |
1020 |
// pmpyshr2 loc38 = loc41, r28, 16 // t6 = buf1 * g4 |
1021 |
// ;; |
1022 |
// padd2 loc37 = loc37, loc40 // t5 = t5 + buf0 |
1023 |
// padd2 loc38 = loc38, loc41 // t6 = t6 + buf1 |
1024 |
// ;; |
1025 |
// padd2 loc16 = loc32, loc35 // x0 = t0 + t3 |
1026 |
// padd2 loc17 = loc33, loc34 // x1 = t1 + t2 |
1027 |
// psub2 loc18 = loc32, loc35 // x2 = t0 - t3 |
1028 |
// psub2 loc19 = loc33, loc34 // x3 = t1 - t2 |
1029 |
// padd2 loc20 = loc36, loc37 // x4 = t4 + t5 |
1030 |
// padd2 loc21 = loc38, loc39 // x5 = t6 + t7 |
1031 |
// psub2 loc22 = loc36, loc37 // x6 = t4 - t5 |
1032 |
// psub2 loc23 = loc38, loc39 // x7 = t6 - t7 |
1033 |
// ;; |
1034 |
// padd2 loc32 = loc16, loc17 // t0 = x0 + x1 |
1035 |
// psub2 loc33 = loc16, loc17 // t1 = x0 - x1 |
1036 |
// pmpyshr2 loc42 = loc18, r30, 16 // buf2 = x2 * c1 |
1037 |
// pmpyshr2 loc43 = loc19, r30, 16 // buf3 = x3 * c1 |
1038 |
// pmpyshr2 loc44 = loc20, r31, 16 // buf4 = x4 * c0 |
1039 |
// pmpyshr2 loc45 = loc21, r31, 16 // buf5 = x5 * c0 |
1040 |
// pmpyshr2 loc46 = loc22, r29, 16 // buf6 = x6 * c2 |
1041 |
// pmpyshr2 loc47 = loc23, r29, 16 // buf7 = x7 * c2 |
1042 |
// ;; |
1043 |
// padd2 loc46 = loc46, loc22 // buf6 = buf6 + x6 |
1044 |
// padd2 loc47 = loc47, loc23 // buf7 = buf7 + x7 |
1045 |
// ;; |
1046 |
// padd2 loc34 = loc18, loc43 // t2 = x2 + (x3 * c1) |
1047 |
// psub2 loc35 = loc42, loc19 // t3 = (c1 * x2) - x3 |
1048 |
// padd2 loc36 = loc20, loc45 // t4 = x4 + (x5 * c0) |
1049 |
// psub2 loc37 = loc44, loc21 // t5 = (c0 * x4) - x5 |
1050 |
// padd2 loc38 = loc22, loc47 // t6 = x6 + (x7 * c2 + x7) |
1051 |
// psub2 loc39 = loc46, loc23 // t7 = (c2 * x6 + x6) - x7 |
1052 |
// ;; |
1053 |
// pmpyshr2 loc16 = loc32, r28, 16 // x0 = t0 * g4 |
1054 |
// pmpyshr2 loc17 = loc33, r28, 16 // x1 = t1 * g4 |
1055 |
// pmpyshr2 loc18 = loc34, r26, 16 // x2 = t2 * g2 |
1056 |
// pmpyshr2 loc19 = loc35, r26, 16 // x3 = t3 * g2 |
1057 |
// pmpyshr2 loc20 = loc36, r25, 16 // x4 = t4 * g1 |
1058 |
// pmpyshr2 loc21 = loc37, r25, 16 // x5 = t5 * g1 |
1059 |
// pmpyshr2 loc22 = loc38, r27, 16 // x6 = t6 * g3 |
1060 |
// pmpyshr2 loc23 = loc39, r27, 16 // x7 = t7 * g3 |
1061 |
// ;; |
1062 |
// padd2 loc40 = loc16, loc32 // y0.2 = x0 + t0 |
1063 |
// padd2 loc41 = loc20, loc36 // y1.2 = x4 + t4 |
1064 |
// padd2 loc42 = loc18, loc34 // y2.2 = x2 + t2 |
1065 |
// padd2 loc43 = loc22, loc38 // y3.2 = x6 + t6 |
1066 |
// padd2 loc44 = loc17, loc33 // y4.2 = x1 + t1 |
1067 |
// padd2 loc45 = loc23, loc39 // y5.2 = x7 + t7 |
1068 |
// padd2 loc46 = loc19, loc35 // y6.2 = x3 + t3 |
1069 |
// padd2 loc47 = loc21, loc37 // y7.2 = x5 + t5 |
1070 |
// ;; |
1071 |
// // ******************* |
1072 |
// // Transpose matrix |
1073 |
// // ******************* |
1074 |
// |
1075 |
// mix2.l loc32 = loc49, loc48 // tmp0 = mixl y1, y0 |
1076 |
// mix2.r loc33 = loc49, loc48 // tmp1 = mixr y1, y0 |
1077 |
// mix2.l loc34 = loc51, loc50 // tmp2 = mixl y3, y2 |
1078 |
// mix2.r loc35 = loc51, loc50 // tmp3 = mixr y3, y2 |
1079 |
// ;; |
1080 |
// mix4.l loc16 = loc34, loc32 // x0 = mixl tmp2, tmp0 |
1081 |
// mix4.l loc17 = loc35, loc33 // x1 = mixl tmp3, tmp1 |
1082 |
// mix4.r loc18 = loc34, loc32 // x2 = mixr tmp2, tmp0 |
1083 |
// mix4.r loc19 = loc35, loc33 // x3 = mixr tmp3, tmp1 |
1084 |
// ;; |
1085 |
// mix2.l loc32 = loc41, loc40 // tmp0 = mixl y1.2, y0.2 |
1086 |
// mix2.r loc33 = loc41, loc40 // tmp1 = mixr y1.2, y0.2 |
1087 |
// mix2.l loc34 = loc43, loc42 // tmp2 = mixl y3.2, y2.2 |
1088 |
// mix2.r loc35 = loc43, loc42 // tmp3 = mixr y3.2, y2.2 |
1089 |
// ;; |
1090 |
// mix4.l loc20 = loc34, loc32 // x4 = mixl tmp2, tmp0 |
1091 |
// mix4.l loc21 = loc35, loc33 // x5 = mixl tmp3, tmp1 |
1092 |
// mix4.r loc22 = loc34, loc32 // x6 = mixr tmp2, tmp0 |
1093 |
// mix4.r loc23 = loc35, loc33 // x7 = mixr tmp3, tmp1 |
1094 |
// ;; |
1095 |
// mix2.l loc32 = loc53, loc52 // tmp0 = mixl y5, y4 |
1096 |
// mix2.r loc33 = loc53, loc52 // tmp1 = mixr y5, y4 |
1097 |
// mix2.l loc34 = loc55, loc54 // tmp2 = mixl y7, y6 |
1098 |
// mix2.r loc35 = loc55, loc54 // tmp3 = mixr y7, y6 |
1099 |
// ;; |
1100 |
// mix4.l loc24 = loc34, loc32 // x0.2 = mixl tmp2, tmp0 |
1101 |
// mix4.l loc25 = loc35, loc33 // x1.2 = mixl tmp3, tmp1 |
1102 |
// mix4.r loc26 = loc34, loc32 // x2.2 = mixr tmp2, tmp0 |
1103 |
// mix4.r loc27 = loc35, loc33 // x3.2 = mixr tmp3, tmp1 |
1104 |
// ;; |
1105 |
// mix2.l loc32 = loc45, loc44 // tmp0 = mixl y5.2, y4.2 |
1106 |
// mix2.r loc33 = loc45, loc44 // tmp1 = mixr y5.2, y4.2 |
1107 |
// mix2.l loc34 = loc47, loc46 // tmp2 = mixl y7.2, y6.2 |
1108 |
// mix2.r loc35 = loc47, loc46 // tmp3 = mixr y7.2, y6.2 |
1109 |
// ;; |
1110 |
// mix4.l loc28 = loc34, loc32 // x4.2 = mixl tmp2, tmp0 |
1111 |
// mix4.l loc29 = loc35, loc33 // x5.2 = mixl tmp3, tmp1 |
1112 |
// mix4.r loc30 = loc34, loc32 // x6.2 = mixr tmp2, tmp0 |
1113 |
// mix4.r loc31 = loc35, loc33 // x7.2 = mixr tmp3, tmp1 |
1114 |
// ;; |
1115 |
// |
1116 |
// // ******** |
1117 |
// // descale |
1118 |
// // ******** |
1119 |
// |
1120 |
// padd2 loc16 = loc16, r23 |
1121 |
// padd2 loc17 = loc17, r23 |
1122 |
// padd2 loc18 = loc18, r23 |
1123 |
// padd2 loc19 = loc19, r23 |
1124 |
// padd2 loc20 = loc20, r23 |
1125 |
// padd2 loc21 = loc21, r23 |
1126 |
// padd2 loc22 = loc22, r23 |
1127 |
// padd2 loc23 = loc23, r23 |
1128 |
// padd2 loc24 = loc24, r23 |
1129 |
// padd2 loc25 = loc25, r23 |
1130 |
// padd2 loc26 = loc26, r23 |
1131 |
// padd2 loc27 = loc27, r23 |
1132 |
// padd2 loc28 = loc28, r23 |
1133 |
// padd2 loc29 = loc29, r23 |
1134 |
// padd2 loc30 = loc30, r23 |
1135 |
// padd2 loc31 = loc31, r23 |
1136 |
// ;; |
1137 |
// pshr2 loc16 = loc16, 3 |
1138 |
// pshr2 loc17 = loc17, 3 |
1139 |
// pshr2 loc18 = loc18, 3 |
1140 |
// pshr2 loc19 = loc19, 3 |
1141 |
// pshr2 loc20 = loc20, 3 |
1142 |
// pshr2 loc21 = loc21, 3 |
1143 |
// pshr2 loc22 = loc22, 3 |
1144 |
// pshr2 loc23 = loc23, 3 |
1145 |
// pshr2 loc24 = loc24, 3 |
1146 |
// pshr2 loc25 = loc25, 3 |
1147 |
// pshr2 loc26 = loc26, 3 |
1148 |
// pshr2 loc27 = loc27, 3 |
1149 |
// pshr2 loc28 = loc28, 3 |
1150 |
// pshr2 loc29 = loc29, 3 |
1151 |
// pshr2 loc30 = loc30, 3 |
1152 |
// pshr2 loc31 = loc31, 3 |
1153 |
// ;; |
1154 |
// // ************ |
1155 |
// // Store Matrix |
1156 |
// // ************ |
1157 |
// st8 [loc0] = loc16 |
1158 |
// st8 [loc1] = loc24 |
1159 |
// st8 [loc2] = loc17 |
1160 |
// st8 [loc3] = loc25 |
1161 |
// st8 [loc4] = loc18 |
1162 |
// st8 [loc5] = loc26 |
1163 |
// st8 [loc6] = loc19 |
1164 |
// st8 [loc7] = loc27 |
1165 |
// st8 [loc8] = loc20 |
1166 |
// st8 [loc9] = loc28 |
1167 |
// st8 [loc10] = loc21 |
1168 |
// st8 [loc11] = loc29 |
1169 |
// st8 [loc12] = loc22 |
1170 |
// st8 [loc13] = loc30 |
1171 |
// st8 [loc14] = loc23 |
1172 |
// st8 [loc15] = loc31 |
1173 |
// |
1174 |
// mov ar.pfs = r14 |
1175 |
// br.ret.sptk.many b0 |
1176 |
// .endp fdct_ia64# |
1177 |
// .common fdct#,8,8 |
1178 |
// |