1 |
.file "quant_h263.c" |
// **************************************************************************** |
2 |
|
// * |
3 |
|
// * XVID MPEG-4 VIDEO CODEC |
4 |
|
// * - IA64 h.263 quantization - |
5 |
|
// * |
6 |
|
// * Copyright(C) 2002 Christian Engel, Hans-Joachim Daniels |
7 |
|
// * |
8 |
|
// * This program is free software; you can redistribute it and/or modify it |
9 |
|
// * under the terms of the GNU General Public License as published by |
10 |
|
// * the Free Software Foundation; either version 2 of the License, or |
11 |
|
// * (at your option) any later version. |
12 |
|
// * |
13 |
|
// * This program is distributed in the hope that it will be useful, |
14 |
|
// * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
// * GNU General Public License for more details. |
17 |
|
// * |
18 |
|
// * You should have received a copy of the GNU General Public License |
19 |
|
// * along with this program; if not, write to the Free Software |
20 |
|
// * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
21 |
|
// * |
22 |
|
// * $Id$ |
23 |
|
// * |
24 |
|
// ***************************************************************************/ |
25 |
|
// |
26 |
|
// **************************************************************************** |
27 |
|
// * |
28 |
|
// * quant_h263_ia64.s, IA-64 h.263 quantization |
29 |
|
// * |
30 |
|
// * This version was implemented during an IA-64 practical training at |
31 |
|
// * the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/) |
32 |
|
// * |
33 |
|
// **************************************************************************** |
34 |
|
|
35 |
|
// ***************************************************************************** |
36 |
|
// * |
37 |
|
// * functions quant_inter and dequant_inter have been software-pipelined |
38 |
|
// * use was made of the pmpyshr2 instruction |
39 |
|
// * |
40 |
|
// * by Christian Engel and Hans-Joachim Daniels |
41 |
|
// * christian.engel@ira.uka.de hans-joachim.daniels@ira.uka.de |
42 |
|
// * |
43 |
|
// * This was made for the ia64 DivX laboratory (yes, it was really called |
44 |
|
// * this way, originally OpenDivX was intended, but died shortly before our |
45 |
|
// * work started (you will probably already know ...)) |
46 |
|
// * at the Universität Karlsruhe (TH) held between April and July 2002 |
47 |
|
// * http://www.info.uni-karlsruhe.de/~rubino/ia64p/ |
48 |
|
// * |
49 |
|
// *****************************************************************************/ |
50 |
|
|
51 |
|
.file "quant_h263_ia64.s" |
52 |
.pred.safe_across_calls p1-p5,p16-p63 |
.pred.safe_across_calls p1-p5,p16-p63 |
53 |
.section .rodata |
.section .rodata |
54 |
.align 4 |
.align 4 |
90 |
.global __divdi3# |
.global __divdi3# |
91 |
.text |
.text |
92 |
.align 16 |
.align 16 |
93 |
.global quant_intra_ia64# |
.global quant_h263_intra_ia64# |
94 |
.proc quant_intra_ia64# |
.proc quant_h263_intra_ia64# |
95 |
quant_intra_ia64: |
quant_h263_intra_ia64: |
96 |
.prologue //12, 37 |
.prologue |
97 |
.save ar.pfs, r38 |
.save ar.pfs, r38 |
98 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
alloc r38 = ar.pfs, 4, 3, 2, 0 |
99 |
adds r16 = -8, r12 |
adds r16 = -8, r12 |
122 |
ld4 r16 = [r16] |
ld4 r16 = [r16] |
123 |
;; |
;; |
124 |
setf.sig f2 = r16 |
setf.sig f2 = r16 |
125 |
(p6) br.cond.dptk .L4 |
(p6) br.cond.dptk .L8 |
126 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
127 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
128 |
;; |
;; |
129 |
add r39 = r39, r15 |
add r39 = r39, r15 |
130 |
br .L38 |
br .L21 |
131 |
;; |
;; |
132 |
.L4: |
.L8: |
133 |
extr r39 = r35, 1, 31 |
extr r39 = r35, 1, 31 |
134 |
sxt4 r40 = r35 |
sxt4 r40 = r35 |
135 |
;; |
;; |
136 |
sub r39 = r15, r39 |
sub r39 = r15, r39 |
137 |
;; |
;; |
138 |
.L38: |
.L21: |
139 |
sxt4 r39 = r39 |
sxt4 r39 = r39 |
140 |
br.call.sptk.many b0 = __divdi3# |
br.call.sptk.many b0 = __divdi3# |
141 |
;; |
;; |
142 |
addl r16 = 2, r0 |
addl r14 = 62, r0 |
143 |
st2 [r32] = r8 |
st2 [r32] = r8 |
144 |
addl r17 = 1, r0 |
addl r19 = 1, r0 |
|
;; |
|
|
add r14 = r33, r16 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L21 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) add r15 = r32, r16 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L39 |
|
|
;; |
|
|
.L21: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r32, r16 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r32, r16 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L39: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r17 = 1, r17 |
|
|
;; |
|
|
cmp4.geu p6, p7 = 63, r17 |
|
|
(p7) br.cond.dptk .L16 |
|
|
addl r14 = 30, r0 |
|
145 |
;; |
;; |
146 |
mov ar.lc = r14 |
mov ar.lc = r14 |
147 |
;; |
;; |
148 |
.L37: |
.L20: |
149 |
dep.z r16 = r17, 1, 32 |
dep.z r17 = r19, 1, 32 |
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
150 |
;; |
;; |
151 |
sxt2 r15 = r15 |
add r15 = r17, r33 |
152 |
;; |
adds r19 = 1, r19 |
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L27 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f6 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r14 = r0, r14 |
|
|
br .L40 |
|
|
;; |
|
|
.L27: |
|
|
cmp4.le p6, p7 = r36, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f6 = f6, f2, f0 |
|
|
(p6) add r15 = r16, r32 |
|
|
;; |
|
|
(p6) getf.sig r14 = f6 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
.L40: |
|
|
//.pred.rel.mutex p6, p7 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
adds r14 = 1, r17 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
153 |
;; |
;; |
154 |
ld2 r14 = [r15] |
ld2 r14 = [r15] |
155 |
;; |
;; |
156 |
sxt2 r14 = r14 |
sxt2 r14 = r14 |
157 |
;; |
;; |
158 |
mov r15 = r14 |
mov r16 = r14 |
159 |
;; |
mov r18 = r14 |
|
cmp4.le p6, p7 = r0, r15 |
|
|
(p6) br.cond.dptk .L33 |
|
|
sub r14 = r0, r15 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r15 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r36, r15 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f6 = r15 |
|
160 |
;; |
;; |
161 |
(p7) st2 [r14] = r0 |
sub r15 = r0, r16 |
162 |
(p6) xma.l f6 = f6, f2, f0 |
cmp4.le p8, p9 = r36, r16 |
163 |
(p6) add r15 = r16, r32 |
cmp4.le p6, p7 = r0, r16 |
164 |
;; |
;; |
165 |
(p6) getf.sig r14 = f6 |
sxt2 r14 = r15 |
166 |
|
(p6) br.cond.dptk .L14 |
167 |
;; |
;; |
168 |
(p6) extr r14 = r14, 16, 16 |
mov r16 = r14 |
169 |
|
add r18 = r17, r32 |
170 |
;; |
;; |
171 |
(p6) sub r14 = r0, r14 |
setf.sig f6 = r16 |
172 |
br .L41 |
cmp4.le p6, p7 = r36, r16 |
173 |
.L33: |
mov r15 = r18 |
|
cmp4.le p6, p7 = r36, r15 |
|
174 |
;; |
;; |
175 |
(p7) add r14 = r16, r32 |
xma.l f6 = f6, f2, f0 |
176 |
(p6) add r15 = r16, r32 |
(p7) st2 [r18] = r0 |
|
(p6) setf.sig f6 = r14 |
|
177 |
;; |
;; |
178 |
(p7) st2 [r14] = r0 |
getf.sig r14 = f6 |
|
(p6) xma.l f6 = f6, f2, f0 |
|
179 |
;; |
;; |
180 |
(p6) getf.sig r14 = f6 |
extr r14 = r14, 16, 16 |
181 |
;; |
;; |
182 |
(p6) extr r14 = r14, 16, 16 |
sub r14 = r0, r14 |
|
.L41: |
|
|
//.pred.rel.mutex p6, p7 |
|
183 |
;; |
;; |
184 |
(p6) st2 [r15] = r14 |
(p6) st2 [r15] = r14 |
185 |
adds r17 = 2, r17 |
br .L12 |
186 |
br.cloop.sptk.few .L37 |
.L14: |
187 |
.L16: |
.pred.rel "mutex", p8, p9 |
188 |
|
setf.sig f6 = r18 |
189 |
|
add r16 = r17, r32 |
190 |
|
;; |
191 |
|
xma.l f6 = f6, f2, f0 |
192 |
|
mov r15 = r16 |
193 |
|
(p9) st2 [r16] = r0 |
194 |
|
;; |
195 |
|
getf.sig r14 = f6 |
196 |
|
;; |
197 |
|
extr r14 = r14, 16, 16 |
198 |
|
;; |
199 |
|
(p8) st2 [r15] = r14 |
200 |
|
.L12: |
201 |
|
br.cloop.sptk.few .L20 |
202 |
adds r18 = 24, r12 |
adds r18 = 24, r12 |
203 |
;; |
;; |
204 |
ld8 r19 = [r18], 8 |
ld8 r19 = [r18], 8 |
210 |
.restore sp |
.restore sp |
211 |
adds r12 = 32, r12 |
adds r12 = 32, r12 |
212 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
213 |
.endp quant_intra_ia64# |
.endp quant_h263_intra_ia64# |
214 |
.align 16 |
.common quant_h263_intra#,8,8 |
215 |
.global quant_inter_ia64# |
.common dequant_h263_intra#,8,8 |
|
.proc quant_inter_ia64# |
|
|
quant_inter_ia64: |
|
|
.prologue |
|
|
addl r14 = @ltoff(multipliers#), gp |
|
|
dep.z r15 = r34, 2, 32 |
|
|
.save ar.lc, r2 |
|
|
mov r2 = ar.lc |
|
|
;; |
|
|
.body |
|
|
ld8 r14 = [r14] |
|
|
extr.u r16 = r34, 1, 16 |
|
|
dep.z r17 = r34, 1, 15 |
|
|
;; |
|
|
add r15 = r15, r14 |
|
|
mov r18 = r16 |
|
|
mov r8 = r0 |
|
|
;; |
|
|
ld4 r15 = [r15] |
|
|
addl r14 = 31, r0 |
|
|
mov r19 = r0 |
|
|
;; |
|
|
setf.sig f6 = r15 |
|
|
mov ar.lc = r14 |
|
|
;; |
|
|
.L65: |
|
|
dep.z r16 = r19, 1, 32 |
|
|
;; |
|
|
add r14 = r16, r33 |
|
|
;; |
|
|
ld2 r15 = [r14] |
|
|
;; |
|
|
sxt2 r15 = r15 |
|
|
;; |
|
|
mov r14 = r15 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L55 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L53 |
|
|
.L55: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L53: |
|
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r16 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r16, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r0, r14 |
|
|
(p6) br.cond.dptk .L61 |
|
|
sub r14 = r0, r14 |
|
|
;; |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) add r16 = r16, r32 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) sub r15 = r0, r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
;; |
|
|
(p6) st2 [r16] = r15 |
|
|
br .L59 |
|
|
.L61: |
|
|
sub r14 = r14, r18 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p7) add r14 = r16, r32 |
|
|
(p6) add r15 = r16, r32 |
|
|
(p6) setf.sig f7 = r14 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p6) xma.l f7 = f7, f6, f0 |
|
|
;; |
|
|
(p6) getf.sig r14 = f7 |
|
|
;; |
|
|
(p6) extr r14 = r14, 16, 16 |
|
|
;; |
|
|
(p6) st2 [r15] = r14 |
|
|
(p6) add r8 = r8, r14 |
|
|
.L59: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L65 |
|
|
;; |
|
|
mov ar.lc = r2 |
|
|
br.ret.sptk.many b0 |
|
|
.endp quant_inter_ia64# |
|
|
.common quant_intra#,8,8 |
|
|
.common dequant_intra#,8,8 |
|
216 |
.align 16 |
.align 16 |
217 |
.global dequant_intra_ia64# |
.global dequant_h263_intra_ia64# |
218 |
.proc dequant_intra_ia64# |
.proc dequant_h263_intra_ia64# |
219 |
dequant_intra_ia64: |
dequant_h263_intra_ia64: |
220 |
.prologue |
.prologue |
221 |
ld2 r14 = [r33] |
ld2 r14 = [r33] |
222 |
andcm r15 = 1, r34 |
andcm r15 = 1, r34 |
244 |
cmp4.le p6, p7 = r16, r15 |
cmp4.le p6, p7 = r16, r15 |
245 |
;; |
;; |
246 |
(p7) st2 [r32] = r16 |
(p7) st2 [r32] = r16 |
247 |
(p7) br.cond.dptk .L68 |
(p7) br.cond.dptk .L32 |
248 |
addl r14 = 2047, r0 |
addl r14 = 2047, r0 |
249 |
;; |
;; |
250 |
cmp4.ge p6, p7 = r14, r15 |
cmp4.ge p6, p7 = r14, r15 |
251 |
;; |
;; |
252 |
(p7) st2 [r32] = r14 |
(p7) st2 [r32] = r14 |
253 |
.L68: |
.L32: |
254 |
addl r14 = 20, r0 |
addl r14 = 62, r0 |
255 |
addl r19 = 1, r0 |
addl r19 = 1, r0 |
256 |
addl r21 = 2048, r0 |
addl r22 = 2048, r0 |
257 |
addl r20 = -2048, r0 |
addl r21 = -2048, r0 |
258 |
addl r18 = 2047, r0 |
addl r20 = 2047, r0 |
259 |
;; |
;; |
260 |
mov ar.lc = r14 |
mov ar.lc = r14 |
261 |
;; |
;; |
262 |
.L110: |
.L56: |
263 |
dep.z r16 = r19, 1, 32 |
dep.z r16 = r19, 1, 32 |
264 |
;; |
;; |
265 |
add r14 = r16, r33 |
add r14 = r16, r33 |
266 |
|
add r17 = r16, r32 |
267 |
|
adds r19 = 1, r19 |
268 |
;; |
;; |
269 |
ld2 r15 = [r14] |
ld2 r15 = [r14] |
270 |
;; |
;; |
271 |
sxt2 r15 = r15 |
sxt2 r15 = r15 |
272 |
;; |
;; |
273 |
cmp4.ne p6, p7 = 0, r15 |
cmp4.ne p6, p7 = 0, r15 |
274 |
|
cmp4.le p8, p9 = r0, r15 |
275 |
;; |
;; |
276 |
(p7) add r14 = r16, r32 |
(p7) st2 [r17] = r0 |
277 |
;; |
(p7) br.cond.dpnt .L36 |
278 |
(p7) st2 [r14] = r0 |
add r18 = r16, r32 |
279 |
(p7) br.cond.dpnt .L92 |
sub r17 = r0, r15 |
280 |
cmp4.le p6, p7 = r0, r15 |
;; |
281 |
(p6) br.cond.dptk .L95 |
mov r14 = r18 |
282 |
sub r14 = r0, r15 |
(p8) br.cond.dptk .L40 |
283 |
add r17 = r16, r32 |
setf.sig f8 = r17 |
|
;; |
|
|
setf.sig f8 = r14 |
|
284 |
;; |
;; |
285 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
286 |
;; |
;; |
287 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
288 |
;; |
;; |
289 |
cmp4.lt p6, p7 = r21, r15 |
cmp4.lt p6, p7 = r22, r15 |
290 |
;; |
sub r16 = r0, r15 |
|
(p7) sub r14 = r0, r15 |
|
291 |
;; |
;; |
292 |
(p7) st2 [r17] = r14 |
(p7) st2 [r14] = r16 |
293 |
(p6) st2 [r17] = r20 |
(p6) st2 [r14] = r21 |
294 |
br .L92 |
br .L36 |
295 |
.L95: |
.L40: |
296 |
setf.sig f8 = r15 |
setf.sig f8 = r15 |
|
add r14 = r16, r32 |
|
297 |
;; |
;; |
298 |
xma.l f8 = f6, f8, f7 |
xma.l f8 = f6, f8, f7 |
299 |
;; |
;; |
300 |
getf.sig r15 = f8 |
getf.sig r15 = f8 |
301 |
;; |
;; |
302 |
cmp4.le p6, p7 = r18, r15 |
cmp4.le p6, p7 = r20, r15 |
|
;; |
|
|
(p6) mov r15 = r18 |
|
303 |
;; |
;; |
304 |
st2 [r14] = r15 |
(p6) mov r14 = r20 |
305 |
.L92: |
(p7) mov r14 = r15 |
|
adds r14 = 1, r19 |
|
|
;; |
|
|
dep.z r17 = r14, 1, 32 |
|
|
;; |
|
|
add r15 = r17, r33 |
|
|
;; |
|
|
ld2 r14 = [r15] |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
mov r16 = r14 |
|
|
;; |
|
|
cmp4.ne p6, p7 = 0, r16 |
|
|
;; |
|
|
(p7) add r14 = r17, r32 |
|
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p7) br.cond.dpnt .L98 |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
(p6) br.cond.dptk .L101 |
|
|
sub r14 = r0, r16 |
|
|
add r17 = r17, r32 |
|
|
;; |
|
|
setf.sig f8 = r14 |
|
|
;; |
|
|
xma.l f8 = f6, f8, f7 |
|
306 |
;; |
;; |
307 |
getf.sig r16 = f8 |
st2 [r18] = r14 |
308 |
|
.L36: |
309 |
|
br.cloop.sptk.few .L56 |
310 |
;; |
;; |
311 |
cmp4.lt p6, p7 = r21, r16 |
mov ar.lc = r2 |
312 |
;; |
br.ret.sptk.many b0 |
313 |
(p7) sub r14 = r0, r16 |
.endp dequant_h263_intra_ia64# |
314 |
;; |
|
315 |
(p7) st2 [r17] = r14 |
|
316 |
(p6) st2 [r17] = r20 |
|
317 |
br .L98 |
// uint32_t quant_h263_inter_ia64(int16_t *coeff, const int16_t *data, const uint32_t quant) |
318 |
.L101: |
|
319 |
setf.sig f8 = r16 |
|
320 |
add r14 = r17, r32 |
|
321 |
;; |
.common quant_h263_inter#,8,8 |
322 |
xma.l f8 = f6, f8, f7 |
.align 16 |
323 |
;; |
.global quant_h263_inter_ia64# |
324 |
getf.sig r16 = f8 |
.proc quant_h263_inter_ia64# |
325 |
;; |
quant_h263_inter_ia64: |
326 |
cmp4.le p6, p7 = r18, r16 |
|
327 |
;; |
|
328 |
(p6) mov r15 = r18 |
//******************************************************* |
329 |
(p7) mov r15 = r16 |
//* * |
330 |
;; |
//* const uint32_t mult = multipliers[quant]; * |
331 |
st2 [r14] = r15 |
//* const uint16_t quant_m_2 = quant << 1; * |
332 |
.L98: |
//* const uint16_t quant_d_2 = quant >> 1; * |
333 |
adds r14 = 2, r19 |
//* int sum = 0; * |
334 |
;; |
//* uint32_t i; * |
335 |
dep.z r17 = r14, 1, 32 |
//* int16_t acLevel,acL; * |
336 |
;; |
//* * |
337 |
add r15 = r17, r33 |
//*******************************************************/ |
338 |
;; |
|
339 |
ld2 r14 = [r15] |
|
340 |
;; |
|
341 |
sxt2 r14 = r14 |
LL=3 // LL = load latency |
342 |
;; |
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
343 |
mov r16 = r14 |
.prologue |
344 |
;; |
addl r14 = @ltoff(multipliers#), gp |
345 |
cmp4.ne p6, p7 = 0, r16 |
dep.z r15 = r34, 2, 32 |
346 |
;; |
.save ar.lc, r2 |
347 |
(p7) add r14 = r17, r32 |
mov r2 = ar.lc |
|
;; |
|
|
(p7) st2 [r14] = r0 |
|
|
(p7) br.cond.dpnt .L104 |
|
|
cmp4.le p6, p7 = r0, r16 |
|
|
(p6) br.cond.dptk .L107 |
|
|
sub r14 = r0, r16 |
|
|
add r17 = r17, r32 |
|
|
;; |
|
|
setf.sig f8 = r14 |
|
|
;; |
|
|
xma.l f8 = f6, f8, f7 |
|
|
;; |
|
|
getf.sig r16 = f8 |
|
|
;; |
|
|
cmp4.lt p6, p7 = r21, r16 |
|
|
;; |
|
|
(p7) sub r14 = r0, r16 |
|
348 |
;; |
;; |
349 |
(p7) st2 [r17] = r14 |
.body |
350 |
(p6) st2 [r17] = r20 |
alloc r9=ar.pfs,0,24,0,24 |
351 |
br .L104 |
mov r17 = ar.ec |
352 |
.L107: |
mov r10 = pr |
353 |
setf.sig f8 = r16 |
ld8 r14 = [r14] |
354 |
add r14 = r17, r32 |
extr.u r16 = r34, 1, 16 //r16 = quant_d_2 |
355 |
|
dep.z r20 = r34, 1, 15 //r20 = quant_m_2 |
356 |
;; |
;; |
357 |
xma.l f8 = f6, f8, f7 |
add r15 = r15, r14 |
358 |
|
mov r21 = r16 //r21 = quant_d_2 |
359 |
|
mov r8 = r0 //r8 = sum = 0 |
360 |
|
mov pr.rot = 0 //p16-p63 = 0 |
361 |
;; |
;; |
362 |
getf.sig r16 = f8 |
ld4 r15 = [r15] |
363 |
|
addl r14 = 63, r0 |
364 |
|
mov pr.rot = 1 << 16 //p16=1 |
365 |
;; |
;; |
366 |
cmp4.le p6, p7 = r18, r16 |
mov ar.lc = r14 |
367 |
|
mov ar.ec = LL+9 |
368 |
|
mov r29 = r15 |
369 |
;; |
;; |
370 |
(p6) mov r15 = r18 |
mov r15 = r33 //r15 = data |
371 |
(p7) mov r15 = r16 |
mov r18 = r32 //r18 = coeff |
372 |
;; |
;; |
373 |
st2 [r14] = r15 |
|
374 |
.L104: |
|
375 |
adds r19 = 3, r19 |
.rotr ac1[LL+3], ac2[8], ac3[2] |
376 |
br.cloop.sptk.few .L110 |
.rotp p[LL+9], cmp1[8], cmp1neg[8],cmp2[5], cmp2neg[2] |
377 |
|
|
378 |
|
|
379 |
|
|
380 |
|
//******************************************************************************* |
381 |
|
//* * |
382 |
|
//* for (i = 0; i < 64; i++) { * |
383 |
|
//* acL=acLevel = data[i]; * |
384 |
|
//* acLevel = ((acLevel < 0)?-acLevel:acLevel) - quant_d_2; * |
385 |
|
//* if (acLevel < quant_m_2){ * |
386 |
|
//* acLevel = 0; * |
387 |
|
//* } * |
388 |
|
//* acLevel = (acLevel * mult) >> SCALEBITS; * |
389 |
|
//* sum += acLevel; * |
390 |
|
//* coeff[i] = ((acL < 0)?-acLevel:acLevel); * |
391 |
|
//* } * |
392 |
|
//* * |
393 |
|
//*******************************************************************************/ |
394 |
|
|
395 |
|
|
396 |
|
|
397 |
|
.explicit |
398 |
|
.L58: |
399 |
|
.pred.rel "clear", p29, p37 |
400 |
|
.pred.rel "mutex", p29, p37 |
401 |
|
|
402 |
|
//pipeline stage |
403 |
|
{.mmi |
404 |
|
(p[0]) ld2 ac1[0] = [r15],2 // 0 acL=acLevel = data[i]; |
405 |
|
(p[LL+1]) sub ac2[0] = r0, ac1[LL+1] // LL+1 ac2=-acLevel |
406 |
|
(p[LL]) sxt2 ac1[LL] = ac1[LL] // LL |
407 |
|
} |
408 |
|
{.mmi |
409 |
|
(p[LL+1]) cmp4.le cmp1[0], cmp1neg[0] = r0, ac1[LL+1] // LL+1 cmp1 = (0<=acLevel) ; cmp1neg = !(0<=acLevel) |
410 |
|
(p[LL+4]) cmp4.le cmp2[0], cmp2neg[0] = r20, ac2[3] // LL+4 cmp2 = (quant_m_2 < acLevel) ; cmp2neg = !(quant_m_2 < acLevel) |
411 |
|
(cmp1[1]) sub ac2[1] = ac1[LL+2], r21 // LL+2 acLevel = acLevel - quant_d_2; |
412 |
|
} |
413 |
|
{.mmi |
414 |
|
(cmp2neg[1]) mov ac2[4] = r0 // LL+5 if (acLevel < quant_m_2) acLevel=0; |
415 |
|
(cmp1neg[1]) sub ac2[1] = ac2[1], r21 // LL+2 acLevel = ac2 - quant_d_2; |
416 |
|
(p[LL+3]) sxt2 ac2[2] = ac2[2] // LL+3 |
417 |
|
} |
418 |
|
{.mmi |
419 |
|
.pred.rel "mutex", p34, p42 |
420 |
|
(cmp1[6]) mov ac3[0] = ac2[6] // LL+7 ac3 = acLevel; |
421 |
|
(cmp1neg[6]) sub ac3[0] = r0, ac2[6] // LL+7 ac3 = -acLevel; |
422 |
|
(p[LL+6]) pmpyshr2.u ac2[5] = r29, ac2[5], 16 // LL+6 acLevel = (acLevel * mult) >> SCALEBITS; |
423 |
|
} |
424 |
|
{.mib |
425 |
|
(p[LL+8]) st2 [r18] = ac3[1] , 2 // LL+8 coeff[i] = ac3; |
426 |
|
(cmp2[4]) add r8 = r8, ac2[7] // LL+8 sum += acLevel; |
427 |
|
br.ctop.sptk.few .L58 |
428 |
|
;; |
429 |
|
} |
430 |
|
|
431 |
|
.pred.rel "clear", p29, p37 |
432 |
|
.default |
433 |
|
mov ar.ec = r17 |
434 |
;; |
;; |
435 |
mov ar.lc = r2 |
mov ar.lc = r2 |
436 |
|
mov pr = r10, -1 |
437 |
|
mov ar.pfs = r9 |
438 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
439 |
.endp dequant_intra_ia64# |
.endp quant_h263_inter_ia64# |
440 |
.common quant_inter#,8,8 |
|
441 |
.common dequant_inter#,8,8 |
|
442 |
|
|
443 |
|
|
444 |
|
|
445 |
|
|
446 |
|
|
447 |
|
// void dequant_h263_inter_ia64(int16_t *data, const int16_t *coeff, const uint32_t quant) |
448 |
|
|
449 |
|
.common dequant_h263_inter#,8,8 |
450 |
.align 16 |
.align 16 |
451 |
.global dequant_inter_ia64# |
.global dequant_h263_inter_ia64# |
452 |
.proc dequant_inter_ia64# |
.proc dequant_h263_inter_ia64# |
453 |
dequant_inter_ia64: |
dequant_h263_inter_ia64: |
454 |
|
|
455 |
|
//*********************************************************************** |
456 |
|
//* * |
457 |
|
//* const uint16_t quant_m_2 = quant << 1; * |
458 |
|
//* const uint16_t quant_add = (quant & 1 ? quant : quant - 1); * |
459 |
|
//* uint32_t i; * |
460 |
|
//* * |
461 |
|
//*********************************************************************** |
462 |
|
|
463 |
|
|
464 |
|
|
465 |
|
|
466 |
.prologue |
.prologue |
467 |
andcm r14 = 1, r34 |
andcm r14 = 1, r34 |
468 |
dep.z r15 = r34, 1, 15 |
dep.z r29 = r34, 1, 15 |
469 |
|
alloc r9=ar.pfs,0,32,0,32 |
470 |
.save ar.lc, r2 |
.save ar.lc, r2 |
471 |
mov r2 = ar.lc |
mov r2 = ar.lc |
472 |
;; |
;; |
473 |
.body |
.body |
474 |
sub r34 = r34, r14 |
sub r15 = r34, r14 // r15 = quant |
475 |
setf.sig f6 = r15 |
addl r14 = 63, r0 |
476 |
mov r19 = r0 |
addl r21 = -2048, r0 |
477 |
addl r14 = 31, r0 |
addl r20 = 2047, r0 |
478 |
addl r18 = -2048, r0 |
mov r16 = ar.ec |
479 |
addl r17 = 2047, r0 |
mov r17 = pr |
480 |
;; |
;; |
481 |
zxt2 r34 = r34 |
zxt2 r15 = r15 |
482 |
mov ar.lc = r14 |
mov ar.lc = r14 |
483 |
|
mov pr.rot = 0 |
484 |
;; |
;; |
485 |
.L122: |
adds r14 = 0, r33 // r14 = coeff |
486 |
dep.z r16 = r19, 1, 32 |
mov r18 = r32 // r18 = data |
487 |
;; |
mov ar.ec = LL+10 |
488 |
add r14 = r16, r33 |
mov pr.rot = 1 << 16 |
489 |
;; |
;; |
490 |
ld2 r15 = [r14] |
|
491 |
;; |
//******************************************************************************* |
492 |
sxt2 r15 = r15 |
//* * |
493 |
;; |
//*for (i = 0; i < 64; i++) { * |
494 |
mov r14 = r15 |
//* int16_t acLevel = coeff[i]; * |
495 |
;; |
//* * |
496 |
cmp4.ne p6, p7 = 0, r14 |
//* if (acLevel == 0) * |
497 |
;; |
//* { * |
498 |
(p7) add r14 = r16, r32 |
//* data[i] = 0; * |
499 |
;; |
//* } * |
500 |
(p7) st2 [r14] = r0 |
//* else if (acLevel < 0) * |
501 |
(p7) br.cond.dpnt .L112 |
//* { * |
502 |
cmp4.le p6, p7 = r0, r14 |
//* acLevel = acLevel * quant_m_2 - quant_add; * |
503 |
(p6) br.cond.dptk .L115 |
//* data[i] = (acLevel >= -2048 ? acLevel : -2048); * |
504 |
setf.sig f7 = r14 |
//* } * |
505 |
add r15 = r16, r32 |
//* else // if (acLevel > 0) * |
506 |
;; |
//* { * |
507 |
xma.l f7 = f7, f6, f0 |
//* acLevel = acLevel * quant_m_2 + quant_add; * |
508 |
;; |
//* data[i] = (acLevel <= 2047 ? acLevel : 2047); * |
509 |
getf.sig r14 = f7 |
//* } * |
510 |
;; |
//* } * |
511 |
sub r14 = r14, r34 |
//* * |
512 |
;; |
//*******************************************************************************/ |
513 |
sxt2 r14 = r14 |
|
514 |
;; |
|
515 |
cmp4.le p6, p7 = r18, r14 |
|
516 |
;; |
LL=2 // LL := load latency |
517 |
(p7) mov r14 = r18 |
//if LL is changed, you'll also have to change the .pred.rel... parts below! |
518 |
br .L123 |
|
519 |
.L115: |
|
520 |
setf.sig f8 = r15 |
.rotr ac1[LL+10], x[5], y1[3], y2[3] |
521 |
setf.sig f7 = r34 |
.rotp p[LL+10] , cmp1neg[8], cmp2[5], cmp2neg[5],cmp3[2], cmp3neg[2] |
522 |
;; |
|
523 |
xma.l f8 = f8, f6, f7 |
.explicit |
524 |
add r15 = r16, r32 |
//pipeline stage |
525 |
;; |
|
526 |
getf.sig r14 = f8 |
.L60: |
527 |
;; |
.pred.rel "clear", p36 |
528 |
sxt2 r14 = r14 |
.pred.rel "mutex", p47, p49 |
529 |
;; |
.pred.rel "mutex", p46, p48 |
530 |
cmp4.le p6, p7 = r17, r14 |
.pred.rel "mutex", p40, p45 |
531 |
;; |
.pred.rel "mutex", p39, p44 |
532 |
(p6) mov r14 = r17 |
.pred.rel "mutex", p38, p43 |
533 |
;; |
.pred.rel "mutex", p37, p42 |
534 |
.L123: |
.pred.rel "mutex", p36, p41 |
535 |
st2 [r15] = r14 |
{.mmi |
536 |
.L112: |
(p[0])ld2 ac1[0] = [r14] ,2 // 0 acLevel = coeff[i]; |
537 |
adds r14 = 1, r19 |
(p[LL+1])cmp4.ne p6, cmp1neg[0] = 0, ac1[LL+1] // LL+1 |
538 |
;; |
(p[LL])sxt2 ac1[LL] = ac1[LL] // LL |
539 |
dep.z r16 = r14, 1, 32 |
|
540 |
;; |
} |
541 |
add r15 = r16, r33 |
{.mmi |
542 |
;; |
(p[LL+1])cmp4.le cmp2[0], cmp2neg[0] = r0, ac1[LL+1] // LL+1 |
543 |
ld2 r14 = [r15] |
(cmp2[1]) mov x[0] = r20 // LL+2 |
544 |
;; |
(p[LL+2])pmpyshr2.u ac1[LL+2] = r29, ac1[LL+2], 0 // LL+2 |
545 |
sxt2 r14 = r14 |
} |
546 |
;; |
{.mmi |
547 |
mov r15 = r14 |
(cmp2neg[1]) mov x[0] = r21 // LL+2 |
548 |
;; |
(cmp2[2]) add ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
549 |
cmp4.ne p6, p7 = 0, r15 |
(cmp2neg[2]) sub ac1[LL+3] = ac1[LL+3], r15 // LL+3 |
550 |
;; |
|
551 |
(p7) add r14 = r16, r32 |
} |
552 |
;; |
{.mmi |
553 |
(p7) st2 [r14] = r0 |
(cmp2neg[4]) mov y1[0] = ac1[LL+5] // LL+5 |
554 |
(p7) br.cond.dpnt .L117 |
(cmp2neg[4]) mov y2[0] = x[3] // LL+5 |
555 |
cmp4.le p6, p7 = r0, r15 |
(p[LL+4])sxt2 ac1[LL+4] = ac1[LL+4] // LL+4 |
556 |
(p6) br.cond.dptk .L120 |
} |
557 |
setf.sig f8 = r15 |
{.mmi |
558 |
;; |
(cmp2[4]) mov y1[0] = x[3] // LL+5 |
559 |
xma.l f8 = f8, f6, f0 |
(cmp2[4]) mov y2[0] = ac1[LL+5] // LL+5 |
560 |
add r15 = r16, r32 |
(p[LL+6])cmp4.le cmp3[0], cmp3neg[0] = x[4], ac1[LL+6] // LL+6 |
561 |
;; |
} |
562 |
getf.sig r14 = f8 |
{.mmi |
563 |
;; |
(cmp3[1]) mov ac1[LL+7] = y1[2] // LL+7 |
564 |
sub r14 = r14, r34 |
(cmp3neg[1]) mov ac1[LL+7] = y2[2] // LL+7 |
565 |
;; |
(cmp1neg[7]) mov ac1[LL+8] = r0 // LL+8 |
566 |
sxt2 r14 = r14 |
} |
567 |
;; |
{.mbb |
568 |
cmp4.le p6, p7 = r18, r14 |
(p[LL+9])st2 [r18] = ac1[LL+9] ,2 // LL+9 |
569 |
;; |
nop.b 0x0 |
570 |
(p7) mov r14 = r18 |
br.ctop.sptk.few .L60 |
571 |
br .L124 |
;; |
572 |
;; |
} |
573 |
.L120: |
.pred.rel "clear", p36 |
574 |
setf.sig f7 = r14 |
.default |
575 |
setf.sig f8 = r34 |
mov ar.lc = r2 |
576 |
add r15 = r16, r32 |
mov ar.pfs = r9 |
577 |
;; |
mov ar.ec = r16 |
578 |
xma.l f7 = f7, f6, f8 |
mov pr = r17, -1 |
|
;; |
|
|
getf.sig r14 = f7 |
|
|
;; |
|
|
sxt2 r14 = r14 |
|
|
;; |
|
|
cmp4.le p6, p7 = r17, r14 |
|
|
;; |
|
|
(p6) mov r14 = r17 |
|
|
;; |
|
|
.L124: |
|
|
st2 [r15] = r14 |
|
|
.L117: |
|
|
adds r19 = 2, r19 |
|
|
br.cloop.sptk.few .L122 |
|
579 |
;; |
;; |
580 |
mov ar.lc = r2 |
mov ar.lc = r2 |
581 |
br.ret.sptk.many b0 |
br.ret.sptk.many b0 |
582 |
.endp dequant_inter_ia64# |
.endp dequant_h263_inter_ia64# |
583 |
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |
.ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" |