Parent Directory | Revision Log
Revision 1.7 - (view) (download)
1 : | Isibaar | 1.7 | // **************************************************************************** |
2 : | // * | ||
3 : | // * XVID MPEG-4 VIDEO CODEC | ||
4 : | // * - IA64 h.263 quantization - | ||
5 : | // * | ||
6 : | // * Copyright(C) 2002 Christian Engel, Hans-Joachim Daniels | ||
7 : | // * | ||
8 : | // * This program is free software; you can redistribute it and/or modify it | ||
9 : | // * under the terms of the GNU General Public License as published by | ||
10 : | // * the Free Software Foundation; either version 2 of the License, or | ||
11 : | // * (at your option) any later version. | ||
12 : | // * | ||
13 : | // * This program is distributed in the hope that it will be useful, | ||
14 : | // * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 : | // * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 : | // * GNU General Public License for more details. | ||
17 : | // * | ||
18 : | // * You should have received a copy of the GNU General Public License | ||
19 : | // * along with this program; if not, write to the Free Software | ||
20 : | // * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
21 : | // * | ||
22 : | // * $Id: quant_h263_ia64.s,v 1.6 2008/12/04 14:41:50 Isibaar Exp $ | ||
23 : | // * | ||
24 : | // ***************************************************************************/ | ||
25 : | // | ||
26 : | // **************************************************************************** | ||
27 : | // * | ||
28 : | // * quant_h263_ia64.s, IA-64 h.263 quantization | ||
29 : | // * | ||
30 : | // * This version was implemented during an IA-64 practical training at | ||
31 : | // * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) | ||
32 : | // * | ||
33 : | // **************************************************************************** | ||
34 : | |||
35 : | // ***************************************************************************** | ||
36 : | // * | ||
37 : | // * functions quant_inter and dequant_inter have been softwarepipelined | ||
38 : | // * use was made of the pmpyshr2 instruction | ||
39 : | // * | ||
40 : | // * by Christian Engel and Hans-Joachim Daniels | ||
41 : | // * christian.engel@ira.uka.de hans-joachim.daniels@ira.uka.de | ||
42 : | // * | ||
43 : | // * This was made for the ia64 DivX laboratory (yes, it was really called | ||
44 : | // * this way, originally OpenDivX was intended, but died shortly before our | ||
45 : | // * work started (you will probably already know ...)) | ||
46 : | // * at the Universität Karlsruhe (TH) held between April and July 2002 | ||
47 : | // * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ | ||
48 : | // * | ||
49 : | // *****************************************************************************/ | ||
50 : | |||
// File preamble and the read-only reciprocal table shared by the quantizers.
// multipliers[q] holds floor(65536 / (2*q)) + 1 for q = 1..31 (entry 0 is 0).
// The quant routines multiply a magnitude by the entry and keep the product
// shifted right by 16 bits, which stands in for a division by 2*q.
// 32 data4 entries = 128 bytes, matching the .size directive below.
51 : | ia64p | 1.4 | .file "quant_h263_ia64.s" |
52 : | ia64p | 1.3 | .pred.safe_across_calls p1-p5,p16-p63 |
53 : | .section .rodata | ||
54 : | .align 4 | ||
55 : | .type multipliers#,@object | ||
56 : | .size multipliers#,128 | ||
57 : | multipliers: | ||
58 : | data4 0 | ||
59 : | data4 32769 | ||
60 : | data4 16385 | ||
61 : | data4 10923 | ||
62 : | data4 8193 | ||
63 : | data4 6554 | ||
64 : | data4 5462 | ||
65 : | data4 4682 | ||
66 : | data4 4097 | ||
67 : | data4 3641 | ||
68 : | data4 3277 | ||
69 : | data4 2979 | ||
70 : | data4 2731 | ||
71 : | data4 2521 | ||
72 : | data4 2341 | ||
73 : | data4 2185 | ||
74 : | data4 2049 | ||
75 : | data4 1928 | ||
76 : | data4 1821 | ||
77 : | data4 1725 | ||
78 : | data4 1639 | ||
79 : | data4 1561 | ||
80 : | data4 1490 | ||
81 : | data4 1425 | ||
82 : | data4 1366 | ||
83 : | data4 1311 | ||
84 : | data4 1261 | ||
85 : | data4 1214 | ||
86 : | data4 1171 | ||
87 : | data4 1130 | ||
88 : | data4 1093 | ||
89 : | data4 1058 | ||
// __divdi3 is the external 64-bit signed division helper called by
// quant_h263_intra_ia64 for element 0.
90 : | .global __divdi3# | ||
91 : | .text | ||
92 : | .align 16 | ||
// quant_h263_intra_ia64 -- quantize one block of 64 int16 values.
//   r32 : base address the 64 int16 results are stored to
//   r33 : base address the 64 int16 inputs are loaded from
//   r34 : quantizer; indexes multipliers[] and, doubled, is the zero threshold
//   r35 : divisor applied to element 0 only
//   NOTE(review): presumably (coeff, data, quant, dcscalar), mirroring the C
//   prototypes quoted for the inter routines in this file -- confirm against
//   the C declaration.
// Element 0    : out[0] = (in[0] +/- (r35 >> 1)) / r35, computed by __divdi3;
//                the half-divisor is subtracted when in[0] <= 0, added otherwise.
// Elements 1-63: out[i] = 0 when |in[i]| < 2*quant, otherwise bits 16..31 of
//                |in[i]| * multipliers[quant], negated when in[i] is negative.
// Non-leaf: saves ar.pfs (r38), b0 (r37), ar.lc and f2 and restores them on exit.
// r8 is not written after the __divdi3 call, so it still holds that quotient
// at return.
93 : | edgomez | 1.6 | .global quant_h263_intra_ia64# |
94 : | .proc quant_h263_intra_ia64# | ||
95 : | quant_h263_intra_ia64: | ||
96 : | ia64p | 1.4 | .prologue |
97 : | ia64p | 1.3 | .save ar.pfs, r38 |
// Frame: 4 inputs (r32-r35), 3 locals (r36-r38), 2 outputs (r39-r40).
98 : | alloc r38 = ar.pfs, 4, 3, 2, 0 | ||
// r16 = old sp - 8 (slot for ar.lc); sp is then lowered by 32 bytes.
99 : | adds r16 = -8, r12 | ||
100 : | .fframe 32 | ||
101 : | adds r12 = -32, r12 | ||
102 : | mov r17 = ar.lc | ||
// r14 = linkage-table slot for multipliers; r15 = in[0] (raw 16-bit load).
103 : | addl r14 = @ltoff(multipliers#), gp | ||
104 : | ld2 r15 = [r33] | ||
105 : | ;; | ||
106 : | .savesp ar.lc, 24 | ||
// Store ar.lc at old sp - 8; the post-increment leaves r16 = old sp.
107 : | st8 [r16] = r17, 8 | ||
108 : | ld8 r14 = [r14] | ||
109 : | sxt2 r15 = r15 | ||
110 : | ;; | ||
111 : | .save.f 0x1 | ||
// f2 is spilled at old sp because it is used below to hold the multiplier.
112 : | stf.spill [r16] = f2 | ||
113 : | .save rp, r37 | ||
// Keep the return address: the br.call below overwrites b0.
114 : | mov r37 = b0 | ||
115 : | .body | ||
// r36 = (quant & 0x7fff) << 1, the zero threshold; r16 = quant * 4 (byte offset).
116 : | dep.z r36 = r34, 1, 15 | ||
117 : | dep.z r16 = r34, 2, 32 | ||
// p6 = (in[0] <= 0), p7 = (in[0] > 0).
118 : | cmp4.ge p6, p7 = 0, r15 | ||
119 : | ;; | ||
120 : | add r16 = r16, r14 | ||
121 : | ;; | ||
// r16 = multipliers[quant]; it is moved to f2 for the xma.l multiplies.
122 : | ld4 r16 = [r16] | ||
123 : | ;; | ||
124 : | setf.sig f2 = r16 | ||
125 : | (p6) br.cond.dptk .L8 | ||
// in[0] > 0: dividend r39 = in[0] + (r35 >> 1); divisor r40 = sign-extended r35.
126 : | extr r39 = r35, 1, 31 | ||
127 : | sxt4 r40 = r35 | ||
128 : | ;; | ||
129 : | add r39 = r39, r15 | ||
130 : | br .L21 | ||
131 : | ;; | ||
132 : | .L8: | ||
// in[0] <= 0: dividend r39 = in[0] - (r35 >> 1); divisor r40 = sign-extended r35.
133 : | extr r39 = r35, 1, 31 | ||
134 : | sxt4 r40 = r35 | ||
135 : | ;; | ||
136 : | sub r39 = r15, r39 | ||
137 : | ;; | ||
138 : | .L21: | ||
// __divdi3(r39, r40): outputs r39/r40 are the arguments, quotient comes back in r8.
139 : | sxt4 r39 = r39 | ||
140 : | br.call.sptk.many b0 = __divdi3# | ||
141 : | ;; | ||
// out[0] = quotient. ar.lc = 62 makes the loop below run 63 times, r19 = i = 1.
142 : | addl r14 = 62, r0 | ||
143 : | st2 [r32] = r8 | ||
144 : | addl r19 = 1, r0 | ||
145 : | ;; | ||
146 : | mov ar.lc = r14 | ||
147 : | ;; | ||
148 : | .L20: | ||
// r17 = i * 2 (byte offset); r15 = &in[i]; i is advanced for the next pass.
149 : | dep.z r17 = r19, 1, 32 | ||
150 : | ;; | ||
151 : | add r15 = r17, r33 | ||
152 : | adds r19 = 1, r19 | ||
153 : | ;; | ||
154 : | ld2 r14 = [r15] | ||
155 : | ;; | ||
156 : | sxt2 r14 = r14 | ||
157 : | ;; | ||
158 : | mov r16 = r14 | ||
159 : | mov r18 = r14 | ||
160 : | ;; | ||
// r15 = -v; p8 = (2*quant <= v), p9 = !p8; p6 = (v >= 0).
161 : | sub r15 = r0, r16 | ||
162 : | cmp4.le p8, p9 = r36, r16 | ||
163 : | cmp4.le p6, p7 = r0, r16 | ||
164 : | ;; | ||
165 : | sxt2 r14 = r15 | ||
166 : | (p6) br.cond.dptk .L14 | ||
167 : | ;; | ||
// Negative input: work on r16 = -v, r18 = &out[i].
168 : | mov r16 = r14 | ||
169 : | add r18 = r17, r32 | ||
170 : | ;; | ||
// p6 = (2*quant <= -v); p7 = below threshold, in which case out[i] = 0.
171 : | setf.sig f6 = r16 | ||
172 : | cmp4.le p6, p7 = r36, r16 | ||
173 : | mov r15 = r18 | ||
174 : | ;; | ||
// f6 = (-v) * multiplier (f0 reads as zero, so nothing is added).
175 : | xma.l f6 = f6, f2, f0 | ||
176 : | (p7) st2 [r18] = r0 | ||
177 : | ;; | ||
178 : | getf.sig r14 = f6 | ||
179 : | ;; | ||
// Take bits 16..31 of the product, then restore the sign.
180 : | extr r14 = r14, 16, 16 | ||
181 : | ;; | ||
182 : | sub r14 = r0, r14 | ||
183 : | ;; | ||
184 : | (p6) st2 [r15] = r14 | ||
185 : | br .L12 | ||
186 : | .L14: | ||
// Non-negative input: r18 = v, r16 = &out[i]; p8/p9 were set from v above.
187 : | ia64p | 1.4 | .pred.rel "mutex", p8, p9 |
188 : | ia64p | 1.3 | setf.sig f6 = r18 |
189 : | add r16 = r17, r32 | ||
190 : | ;; | ||
191 : | xma.l f6 = f6, f2, f0 | ||
192 : | mov r15 = r16 | ||
// Below threshold (p9): out[i] = 0.
193 : | (p9) st2 [r16] = r0 | ||
194 : | ;; | ||
195 : | getf.sig r14 = f6 | ||
196 : | ;; | ||
197 : | extr r14 = r14, 16, 16 | ||
198 : | ;; | ||
// At or above threshold (p8): out[i] = bits 16..31 of v * multiplier.
199 : | (p8) st2 [r15] = r14 | ||
200 : | .L12: | ||
201 : | br.cloop.sptk.few .L20 | ||
// Epilogue: reload ar.lc from sp + 24 and f2 from sp + 32 (the old sp),
// restore ar.pfs and b0, then release the 32-byte frame.
202 : | adds r18 = 24, r12 | ||
203 : | ;; | ||
204 : | ld8 r19 = [r18], 8 | ||
205 : | mov ar.pfs = r38 | ||
206 : | mov b0 = r37 | ||
207 : | ;; | ||
208 : | mov ar.lc = r19 | ||
209 : | ldf.fill f2 = [r18] | ||
210 : | .restore sp | ||
211 : | adds r12 = 32, r12 | ||
212 : | br.ret.sptk.many b0 | ||
213 : | edgomez | 1.6 | .endp quant_h263_intra_ia64# |
// Two 8-byte, 8-aligned common symbols; nothing in this file reads or writes
// them. NOTE(review): presumably function-pointer slots used by the C side --
// confirm.
214 : | .common quant_h263_intra#,8,8 | ||
215 : | .common dequant_h263_intra#,8,8 | ||
216 : | ia64p | 1.3 | .align 16 |
// dequant_h263_intra_ia64 -- dequantize one block of 64 int16 values.
//   r32 : base address the 64 int16 results are stored to
//   r33 : base address the 64 int16 inputs are loaded from
//   r34 : quantizer (overwritten below with 2*quant)
//   r35 : scale factor applied to element 0 only
//   NOTE(review): presumably (data, coeff, quant, dcscalar) -- confirm against
//   the C declaration.
// Element 0    : out[0] = in[0] * r35, clamped to [-2048, 2047]; the clamp is
//                decided on the 16-bit sign-extended product.
// Elements 1-63: 0 stays 0; otherwise |v| * 2*quant + quant_add with the sign
//                of v, limited to -2048 / 2047. quant_add = quant if quant is
//                odd, quant - 1 if it is even.
// Leaf routine: no alloc, only ar.lc is saved (in r2) and restored.
217 : | edgomez | 1.6 | .global dequant_h263_intra_ia64# |
218 : | .proc dequant_h263_intra_ia64# | ||
219 : | dequant_h263_intra_ia64: | ||
220 : | ia64p | 1.3 | .prologue |
// r14 = in[0]; r15 = 1 & ~quant (1 when quant is even); f8 = r35.
221 : | ld2 r14 = [r33] | ||
222 : | andcm r15 = 1, r34 | ||
223 : | setf.sig f8 = r35 | ||
224 : | ;; | ||
// r15 = quant_add; r16 = lower clamp bound.
225 : | sxt2 r14 = r14 | ||
226 : | sub r15 = r34, r15 | ||
227 : | addl r16 = -2048, r0 | ||
228 : | ;; | ||
// f6 = in[0], f7 = quant_add (addend of the loop multiplies), r34 = 2*quant.
229 : | setf.sig f6 = r14 | ||
230 : | setf.sig f7 = r15 | ||
231 : | shladd r34 = r34, 1, r0 | ||
232 : | ;; | ||
// f8 = in[0] * r35.
233 : | xma.l f8 = f6, f8, f0 | ||
234 : | .save ar.lc, r2 | ||
235 : | mov r2 = ar.lc | ||
236 : | ;; | ||
237 : | .body | ||
// r14 = product; f6 is reused from here on for 2*quant.
238 : | getf.sig r14 = f8 | ||
239 : | setf.sig f6 = r34 | ||
240 : | ;; | ||
// Store the product first, then overwrite it if it falls outside the range.
241 : | sxt2 r15 = r14 | ||
242 : | st2 [r32] = r14 | ||
243 : | ;; | ||
// p7 = (product < -2048): clamp low and skip the upper check.
244 : | cmp4.le p6, p7 = r16, r15 | ||
245 : | ;; | ||
246 : | (p7) st2 [r32] = r16 | ||
247 : | (p7) br.cond.dptk .L32 | ||
248 : | addl r14 = 2047, r0 | ||
249 : | ;; | ||
// p7 = (product > 2047): clamp high.
250 : | cmp4.ge p6, p7 = r14, r15 | ||
251 : | ;; | ||
252 : | (p7) st2 [r32] = r14 | ||
253 : | .L32: | ||
// Loop setup: ar.lc = 62 gives 63 passes, r19 = i = 1; r22/r21/r20 hold the
// constants 2048 / -2048 / 2047 used by the clamps.
254 : | addl r14 = 62, r0 | ||
255 : | addl r19 = 1, r0 | ||
256 : | addl r22 = 2048, r0 | ||
257 : | addl r21 = -2048, r0 | ||
258 : | addl r20 = 2047, r0 | ||
259 : | ;; | ||
260 : | mov ar.lc = r14 | ||
261 : | ;; | ||
262 : | .L56: | ||
// r16 = i * 2 (byte offset); r14 = &in[i]; r17 = &out[i].
263 : | dep.z r16 = r19, 1, 32 | ||
264 : | ;; | ||
265 : | add r14 = r16, r33 | ||
266 : | add r17 = r16, r32 | ||
267 : | adds r19 = 1, r19 | ||
268 : | ;; | ||
269 : | ld2 r15 = [r14] | ||
270 : | ;; | ||
271 : | sxt2 r15 = r15 | ||
272 : | ;; | ||
// p7 = (v == 0); p8 = (v >= 0).
273 : | cmp4.ne p6, p7 = 0, r15 | ||
274 : | cmp4.le p8, p9 = r0, r15 | ||
275 : | ;; | ||
// Zero input: out[i] = 0 and continue with the next element.
276 : | (p7) st2 [r17] = r0 | ||
277 : | (p7) br.cond.dpnt .L36 | ||
278 : | add r18 = r16, r32 | ||
279 : | sub r17 = r0, r15 | ||
280 : | ;; | ||
281 : | mov r14 = r18 | ||
282 : | (p8) br.cond.dptk .L40 | ||
// Negative input: r15 = (-v) * 2*quant + quant_add, a positive magnitude.
283 : | setf.sig f8 = r17 | ||
284 : | ;; | ||
285 : | xma.l f8 = f6, f8, f7 | ||
286 : | ;; | ||
287 : | getf.sig r15 = f8 | ||
288 : | ;; | ||
// p6 = (magnitude > 2048): store -2048; otherwise store the negated magnitude.
289 : | cmp4.lt p6, p7 = r22, r15 | ||
290 : | sub r16 = r0, r15 | ||
291 : | ;; | ||
292 : | (p7) st2 [r14] = r16 | ||
293 : | (p6) st2 [r14] = r21 | ||
294 : | br .L36 | ||
295 : | .L40: | ||
// Positive input: r15 = v * 2*quant + quant_add.
296 : | setf.sig f8 = r15 | ||
297 : | ;; | ||
298 : | xma.l f8 = f6, f8, f7 | ||
299 : | ;; | ||
300 : | getf.sig r15 = f8 | ||
301 : | ;; | ||
// p6 = (result >= 2047): store 2047; otherwise store the result.
302 : | cmp4.le p6, p7 = r20, r15 | ||
303 : | ;; | ||
304 : | (p6) mov r14 = r20 | ||
305 : | (p7) mov r14 = r15 | ||
306 : | ;; | ||
307 : | st2 [r18] = r14 | ||
308 : | .L36: | ||
309 : | br.cloop.sptk.few .L56 | ||
310 : | ;; | ||
311 : | mov ar.lc = r2 | ||
312 : | br.ret.sptk.many b0 | ||
313 : | edgomez | 1.6 | .endp dequant_h263_intra_ia64# |
314 : | ia64p | 1.3 | |
315 : | |||
316 : | |||
// quant_h263_inter_ia64 -- software-pipelined quantizer for one 64-element block.
//   r32 = coeff (output), r33 = data (input), r34 = quant; returns the sum in r8.
// Register roles set up before the loop:
//   r15 = read pointer, r18 = write pointer, r20 = quant_m_2, r21 = quant_d_2,
//   r29 = multipliers[quant], r8 = running sum,
//   r2 / r17 / r10 / r9 = saved ar.lc / ar.ec / pr / ar.pfs.
// The loop is a br.ctop modulo-scheduled loop with LL+9 = 12 stages (LL = 3);
// every instruction is guarded by the rotating predicate of its stage.
317 : | edgomez | 1.6 | // uint32_t quant_h263_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) |
318 : | ia64p | 1.3 | |
319 : | |||
320 : | |||
321 : | edgomez | 1.6 | .common quant_h263_inter#,8,8 |
322 : | ia64p | 1.3 | .align 16 |
323 : | edgomez | 1.6 | .global quant_h263_inter_ia64# |
324 : | .proc quant_h263_inter_ia64# | ||
325 : | quant_h263_inter_ia64: | ||
326 : | ia64p | 1.3 | |
327 : | |||
328 : | ia64p | 1.4 | //******************************************************* |
329 : | //* * | ||
330 : | //* const uint32_t mult = multipliers[quant]; * | ||
331 : | //* const uint16_t quant_m_2 = quant << 1; * | ||
332 : | //* const uint16_t quant_d_2 = quant >> 1; * | ||
333 : | //* int sum = 0; * | ||
334 : | //* uint32_t i; * | ||
335 : | //* int16_t acLevel,acL; * | ||
336 : | //* * | ||
337 : | //*******************************************************/ | ||
338 : | ia64p | 1.3 | |
339 : | |||
340 : | |||
341 : | LL=3 // LL = load latency | ||
342 : | ia64p | 1.4 | //if LL is changed, you'll also have to change the .pred.rel... parts below! |
343 : | ia64p | 1.3 | .prologue |
// r14 = linkage-table slot for multipliers; r15 = quant * 4 (byte offset).
344 : | addl r14 = @ltoff(multipliers#), gp | ||
345 : | dep.z r15 = r34, 2, 32 | ||
346 : | .save ar.lc, r2 | ||
347 : | mov r2 = ar.lc | ||
348 : | ;; | ||
349 : | .body | ||
// 24 locals, all rotating. r32-r34 (the arguments) lie inside that region, so
// they are copied to static registers below before the loop starts rotating.
350 : | alloc r9=ar.pfs,0,24,0,24 | ||
351 : | mov r17 = ar.ec | ||
352 : | mov r10 = pr | ||
353 : | ld8 r14 = [r14] | ||
354 : | extr.u r16 = r34, 1, 16 //r16 = quant_d_2 | ||
355 : | dep.z r20 = r34, 1, 15 //r20 = quant_m_2 | ||
356 : | ;; | ||
357 : | add r15 = r15, r14 | ||
358 : | mov r21 = r16 //r21 = quant_d_2 | ||
359 : | mov r8 = r0 //r8 = sum = 0 | ||
360 : | mov pr.rot = 0 //p16-p63 = 0 | ||
361 : | ;; | ||
// r15 = multipliers[quant]; ar.lc = 63 gives 64 iterations; only the stage-0
// predicate (p16) is set on entry.
362 : | ld4 r15 = [r15] | ||
363 : | addl r14 = 63, r0 | ||
364 : | mov pr.rot = 1 << 16 //p16=1 | ||
365 : | ;; | ||
// ar.ec = number of pipeline stages, so the epilogue drains the last iterations.
366 : | mov ar.lc = r14 | ||
367 : | mov ar.ec = LL+9 | ||
368 : | mov r29 = r15 | ||
369 : | ;; | ||
370 : | mov r15 = r33 //r15 = data | ||
371 : | mov r18 = r32 //r18 = coeff | ||
372 : | ;; | ||
373 : | |||
374 : | |||
// Rotating names. Registers are handed out from r32 and predicates from p16,
// in declaration order: p[] = p16-p27, cmp1[] = p28-p35, cmp1neg[] = p36-p43,
// cmp2[] = p44-p48, cmp2neg[] = p49-p50. Hence cmp1[1]/cmp1neg[1] = p29/p37
// and cmp1[6]/cmp1neg[6] = p34/p42, the pairs named in the .pred.rel lines.
375 : | .rotr ac1[LL+3], ac2[8], ac3[2] | ||
376 : | .rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] | ||
377 : | |||
378 : | |||
379 : | |||
380 : | ia64p | 1.4 | //******************************************************************************* |
381 : | //* * | ||
382 : | //* for (i = 0; i < 64; i++) { * | ||
383 : | //* acL=acLevel = data[i]; * | ||
384 : | //* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * | ||
385 : | //* if (acLevel < quant_m_2){ * | ||
386 : | //* acLevel = 0; * | ||
387 : | //* } * | ||
388 : | //* acLevel = (acLevel * mult) >> SCALEBITS; * | ||
389 : | //* sum += acLevel; * | ||
390 : | //* coeff[i] = ((acL < 0)?-acLevel:acLevel); * | ||
391 : | //* } * | ||
392 : | //* * | ||
393 : | //*******************************************************************************/ | ||
394 : | ia64p | 1.3 | |
395 : | |||
396 : | |||
// .explicit: stops are written by hand from here to .default; the whole loop
// body is one instruction group, closed by the ;; after br.ctop.
397 : | .explicit | ||
398 : | .L58: | ||
399 : | ia64p | 1.4 | .pred.rel "clear", p29, p37 |
400 : | .pred.rel "mutex", p29, p37 | ||
401 : | ia64p | 1.5 | |
402 : | ia64p | 1.3 | //pipeline stage |
403 : | {.mmi | ||
404 : | ia64p | 1.5 | (p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
405 : | ia64p | 1.3 | (p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel |
406 : | (p[LL]) sxt2 ac1[LL] = ac1[LL] // LL | ||
407 : | } | ||
408 : | ia64p | 1.5 | {.mmi |
409 : | (p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) | ||
410 : | (p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 <= acLevel) ; cmp2neg = !(quant_m_2 <= acLevel) | ||
411 : | (cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; | ||
412 : | ia64p | 1.3 | } |
413 : | {.mmi | ||
414 : | (cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; | ||
415 : | (cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; | ||
416 : | (p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 | ||
417 : | ia64p | 1.5 | } |
418 : | ia64p | 1.3 | {.mmi |
419 : | ia64p | 1.4 | .pred.rel "mutex", p34, p42 |
420 : | ia64p | 1.3 | (cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
421 : | (cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; | ||
422 : | (p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; | ||
423 : | } | ||
424 : | {.mib | ||
425 : | (p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; | ||
426 : | (cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; | ||
427 : | br.ctop.sptk.few .L58 | ||
428 : | ;; | ||
429 : | } | ||
430 : | ia64p | 1.4 | |
431 : | .pred.rel "clear", p29, p37 | ||
432 : | ia64p | 1.3 | .default |
// Epilogue: restore ar.ec, ar.lc, all predicates and ar.pfs; r8 carries the sum.
433 : | mov ar.ec = r17 | ||
434 : | ;; | ||
435 : | mov ar.lc = r2 | ||
436 : | mov pr = r10, -1 | ||
437 : | mov ar.pfs = r9 | ||
438 : | br.ret.sptk.many b0 | ||
439 : | edgomez | 1.6 | .endp quant_h263_inter_ia64# |
440 : | ia64p | 1.3 | |
441 : | |||
442 : | |||
443 : | |||
444 : | |||
445 : | |||
446 : | |||
// dequant_h263_inter_ia64 -- software-pipelined dequantizer for one 64-element block.
//   r32 = data (output), r33 = coeff (input), r34 = quant; no value is returned.
// Register roles set up before the loop:
//   r14 = read pointer, r18 = write pointer, r29 = quant_m_2, r15 = quant_add,
//   r20 = 2047, r21 = -2048,
//   r2 / r16 / r17 / r9 = saved ar.lc / ar.ec / pr / ar.pfs.
// The loop is a br.ctop modulo-scheduled loop; the clamp is done branch-free by
// selecting between the computed value and the bound (x = 2047 or -2048).
447 : | edgomez | 1.6 | // void dequant_h263_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) |
448 : | ia64p | 1.3 | |
449 : | edgomez | 1.6 | .common dequant_h263_inter#,8,8 |
450 : | ia64p | 1.3 | .align 16 |
451 : | edgomez | 1.6 | .global dequant_h263_inter_ia64# |
452 : | .proc dequant_h263_inter_ia64# | ||
453 : | dequant_h263_inter_ia64: | ||
454 : | ia64p | 1.3 | |
455 : | //*********************************************************************** | ||
456 : | ia64p | 1.4 | //* * |
457 : | //* const uint16_t quant_m_2 = quant << 1; * | ||
458 : | //* const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * | ||
459 : | //* uint32_t i; * | ||
460 : | //* * | ||
461 : | //*********************************************************************** | ||
462 : | ia64p | 1.3 | |
463 : | |||
464 : | |||
465 : | |||
466 : | .prologue | ||
// r14 = 1 & ~quant (1 when quant is even); r29 = quant_m_2.
// 32 locals, all rotating; r32-r34 are copied out before the loop rotates them.
467 : | andcm r14 = 1, r34 | ||
468 : | dep.z r29 = r34, 1, 15 | ||
469 : | alloc r9=ar.pfs,0,32,0,32 | ||
470 : | .save ar.lc, r2 | ||
471 : | mov r2 = ar.lc | ||
472 : | ;; | ||
473 : | .body | ||
474 : | sub r15 = r34, r14 // r15 = quant_add (quant if odd, quant - 1 if even) | ||
475 : | addl r14 = 63, r0 | ||
476 : | addl r21 = -2048, r0 | ||
477 : | addl r20 = 2047, r0 | ||
478 : | mov r16 = ar.ec | ||
479 : | mov r17 = pr | ||
480 : | ;; | ||
// ar.lc = 63 gives 64 iterations; rotating predicates are cleared first.
481 : | zxt2 r15 = r15 | ||
482 : | mov ar.lc = r14 | ||
483 : | mov pr.rot = 0 | ||
484 : | ;; | ||
485 : | adds r14 = 0, r33 // r14 = coeff | ||
486 : | mov r18 = r32 // r18 = data | ||
// NOTE(review): LL is still 3 at this point (assigned for quant_h263_inter_ia64
// earlier in the file); the LL=2 assignment for this routine only appears
// further down. If the assembler evaluates LL here, ar.ec becomes 13 while the
// loop below has LL+10 = 12 stages with LL=2 -- confirm whether the extra
// epilogue pass is intended.
487 : | mov ar.ec = LL+10 | ||
488 : | mov pr.rot = 1 << 16 | ||
489 : | ;; | ||
490 : | |||
491 : | ia64p | 1.4 | //******************************************************************************* |
492 : | //* * | ||
493 : | //*for (i = 0; i < 64; i++) { * | ||
494 : | //* int16_t acLevel = coeff[i]; * | ||
495 : | //* * | ||
496 : | //* if (acLevel == 0) * | ||
497 : | //* { * | ||
498 : | //* data[i] = 0; * | ||
499 : | //* } * | ||
500 : | //* else if (acLevel < 0) * | ||
501 : | //* { * | ||
502 : | //* acLevel = acLevel * quant_m_2 - quant_add; * | ||
503 : | //* data[i] = (acLevel >= -2048 ? acLevel : -2048); * | ||
504 : | //* } * | ||
505 : | //* else // if (acLevel > 0) * | ||
506 : | //* { * | ||
507 : | //* acLevel = acLevel * quant_m_2 + quant_add; * | ||
508 : | //* data[i] = (acLevel <= 2047 ? acLevel : 2047); * | ||
509 : | //* } * | ||
510 : | //* } * | ||
511 : | //* * | ||
512 : | //*******************************************************************************/ | ||
513 : | ia64p | 1.3 | |
514 : | |||
515 : | |||
516 : | LL=2 // LL := load latency | ||
517 : | ia64p | 1.4 | //if LL is changed, you'll also have to change the .pred.rel... parts below! |
518 : | ia64p | 1.3 | |
519 : | |||
// Rotating names. With LL=2: p[] = p16-p27, cmp1neg[] = p28-p35,
// cmp2[] = p36-p40, cmp2neg[] = p41-p45, cmp3[] = p46-p47, cmp3neg[] = p48-p49.
// These are the pairs declared mutually exclusive in the .pred.rel lines below.
520 : | .rotr ac1[LL+10], x[5], y1[3], y2[3] | ||
521 : | .rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] | ||
522 : | |||
523 : | .explicit | ||
524 : | //pipeline stage | ||
525 : | |||
526 : | .L60: | ||
527 : | ia64p | 1.4 | .pred.rel "clear", p36 |
528 : | .pred.rel "mutex", p47, p49 | ||
529 : | .pred.rel "mutex", p46, p48 | ||
530 : | .pred.rel "mutex", p40, p45 | ||
531 : | .pred.rel "mutex", p39, p44 | ||
532 : | .pred.rel "mutex", p38, p43 | ||
533 : | .pred.rel "mutex", p37, p42 | ||
534 : | .pred.rel "mutex", p36, p41 | ||
// Stage 0 loads, stage LL sign-extends, stage LL+1 sets cmp1neg = (acLevel == 0);
// the p6 result of that compare is not used anywhere in the loop.
535 : | ia64p | 1.3 | {.mmi |
536 : | (p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; | ||
537 : | (p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 | ||
538 : | (p[LL])sxt2 ac1[LL] = ac1[LL] // LL | ||
539 : | |||
540 : | } | ||
// cmp2 = (acLevel >= 0), cmp2neg = (acLevel < 0); x = the bound for that sign
// (2047 or -2048); the multiply forms acLevel * quant_m_2 with shift count 0.
541 : | {.mmi | ||
542 : | (p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 | ||
543 : | (cmp2[1]) mov x[0] = r20 // LL+2 | ||
544 : | (p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 | ||
545 : | } | ||
// Add quant_add for non-negative values, subtract it for negative ones.
546 : | {.mmi | ||
547 : | (cmp2neg[1]) mov x[0] = r21 // LL+2 | ||
548 : | (cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 | ||
549 : | (cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 | ||
550 : | |||
551 : | } | ||
// y1 is the value chosen when (x <= acLevel), y2 the value chosen otherwise.
// Negative: y1 = acLevel, y2 = -2048.
552 : | {.mmi | ||
553 : | (cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 | ||
554 : | (cmp2neg[4]) mov y2[0] = x[3] // LL+5 | ||
555 : | (p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 | ||
556 : | } | ||
// Non-negative: y1 = 2047, y2 = acLevel. cmp3 = (x <= acLevel).
557 : | {.mmi | ||
558 : | ia64p | 1.4 | (cmp2[4]) mov y1[0] = x[3] // LL+5 |
559 : | (cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 | ||
560 : | ia64p | 1.3 | (p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
561 : | } | ||
// Pick the clamped result, then force 0 when the input was 0.
562 : | {.mmi | ||
563 : | (cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 | ||
564 : | (cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 | ||
565 : | (cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 | ||
566 : | } | ||
// Last stage stores data[i] and advances the write pointer.
567 : | {.mbb | ||
568 : | (p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 | ||
569 : | nop.b 0x0 | ||
570 : | br.ctop.sptk.few .L60 | ||
571 : | ;; | ||
572 : | } | ||
573 : | ia64p | 1.4 | .pred.rel "clear", p36 |
574 : | ia64p | 1.3 | .default |
// Epilogue: restore ar.lc, ar.pfs, ar.ec and all predicates.
575 : | mov ar.lc = r2 | ||
576 : | mov ar.pfs = r9 | ||
577 : | mov ar.ec = r16 | ||
578 : | mov pr = r17, -1 | ||
579 : | ;; | ||
// ar.lc is written a second time with the same r2 (already restored just above).
580 : | mov ar.lc = r2 | ||
581 : | br.ret.sptk.many b0 | ||
582 : | edgomez | 1.6 | .endp dequant_h263_inter_ia64# |
583 : | ia64p | 1.3 | .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
No admin address has been configured | ViewVC Help |
Powered by ViewVC 1.0.4 |