;/*
; * Simple IDCT MMX
; *
; * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
; *
; * This library is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2 of the License, or (at your option) any later version.
; *
; * This library is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with this library; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Ported to nasm by Peter Ross <pross@xvid.org>
; */
|
23 |
|
bits 32

;===========================================================================
; data
;===========================================================================

%ifdef FORMAT_COFF
section .data
align 8
%else
section .data data align=8
%endif

; qword mask 0xffff0000_ffff0000: keeps only the two odd 16-bit words of a
; qword.  Used by DC_COND_IDCT so the DC word of src0 is ignored in its
; "any AC present?" test.
wm1010 dw 0, 0xffff, 0, 0xffff

; bias (low dword 0x40000) added in DC_COND_IDCT's DC-only shortcut before
; its psrad by 13
d40000 dd 0x40000, 0

%define ROW_SHIFT 11              ; fixed-point shift of the row pass
%define COL_SHIFT 20              ; fixed-point shift of the column pass

; scaled cosines: cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded.
; NOTE: C4 is deliberately 16383 rather than the rounded 16384
; (see the "- 0.5" in its comment).
%define C0 23170 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 23170.475006
%define C1 22725 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 22725.260826
%define C2 21407 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 21406.727617
%define C3 19266 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 19265.545870
%define C4 16383 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 = 16384.000000
%define C5 12873 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 12872.826198
%define C6 8867 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 8866.956905
%define C7 4520 ;cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 = 4520.335430

; Coefficient table.  The trailing "; N" comments are the byte offsets used
; by the [coeffs+N] references in the macros below: +0 and +8 are the row
; rounders, +16 onward are cosine pairs interleaved for pmaddwd.
coeffs
dw 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, ; 0
dw 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, ; 8

dw C4, C4, C4, C4 ; 16
dw C4, -C4, C4, -C4 ; 24

dw C2, C6, C2, C6 ; 32
dw C6, -C2, C6, -C2 ; 40

dw C1, C3, C1, C3 ; 48
dw C5, C7, C5, C7 ; 56

dw C3, -C7, C3, -C7 ; 64
dw -C1, -C5, -C1, -C5 ; 72

dw C5, -C1, C5, -C1 ; 80
dw C7, C3, C7, C3 ; 88

dw C7, -C5, C7, -C5 ; 96
dw C3, -C1, C3, -C1 ; 104

;===========================================================================
; text
;===========================================================================
section .text
;---------------------------------------------------------------------------
; DC_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Row pass over one group of four source qwords, with a DC-only shortcut:
; src0 is masked with wm1010 (DC word dropped) and ORed with the other
; three qwords; if the result is zero, the whole 32-byte output group is
; filled with the scaled DC value (pslld 16 / +d40000 / psrad 13) instead
; of running the full transform.  Otherwise the 1-D butterfly is computed
; from the coeffs table, "rounder_op mmN, rounder_arg" applies the row
; rounder, results are shifted right by "shift" and stored as four packed
; qwords at dst+0/8/16/24.
; Clobbers: mm0-mm7, eax, flags.
; NOTE: uses local labels .skip1/.skip2, so this macro can only be
; instantiated once per enclosing non-local label.
;---------------------------------------------------------------------------
%macro DC_COND_IDCT 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm2,[src1] ; R3 R1 r3 r1
movq mm3,[src5] ; R7 R5 r7 r5
; AC-presence test: OR everything except the DC word of src0
movq mm4,[wm1010]
pand mm4,mm0
por mm4,mm1
por mm4,mm2
por mm4,mm3
packssdw mm4,mm4
movd eax,mm4
or eax,eax
jz near .skip1
; full row transform: even part (A0..A3) ...
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
movq mm7,[coeffs+48] ; C3 C1 C3 C1
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
rounder_op mm0, rounder_arg
paddd mm1,mm0 ; A1 a1
paddd mm0,mm0
psubd mm0,mm1 ; A2 a2
; ... and odd part (B0..B3), combined into (A+B)>>shift / (A-B)>>shift
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm5 ; B0 b0
movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
paddd mm5,mm2 ; B1 b1
psrad mm7,shift
psrad mm4,shift
movq mm2,mm1 ; A1 a1
paddd mm1,mm5 ; A1+B1 a1+b1
psubd mm2,mm5 ; A1-B1 a1-b1
psrad mm1,shift
psrad mm2,shift
packssdw mm7,mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
movq [dst],mm7
movq mm1,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+80] ;-C1 C5 -C1 C5
movq [dst + 24],mm2
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
movq mm7,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm0 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm4,mm7 ; B2 b2
paddd mm2,mm4 ; A2+B2 a2+b2
psubd mm0,mm4 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm0,shift
movq mm4,mm6 ; A3 a3
paddd mm3,mm1 ; B3 b3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm4,mm3 ; a3-B3 a3-b3
psrad mm6,shift
packssdw mm2,mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
movq [ dst + 8],mm2
psrad mm4,shift
packssdw mm4,mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
movq [ dst + 16],mm4
jmp short .skip2
.skip1
; DC-only path: scale the DC word and replicate it across the group
pslld mm0,16
paddd mm0,[d40000]
psrad mm0,13
packssdw mm0,mm0
movq [ dst ],mm0
movq [ dst + 8],mm0
movq [ dst + 16],mm0
movq [ dst + 24],mm0
.skip2
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
184 |
|
|
185 |
|
|
186 |
|
|
187 |
|
;---------------------------------------------------------------------------
; Z_COND_IDCT src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift, bt
;
; Same row transform as DC_COND_IDCT, but the zero test ORs all four
; source qwords unmasked, and an all-zero group jumps to the label given
; in "bt" (writing nothing) instead of taking a DC shortcut.  The caller
; uses those branch targets to dispatch to a specialised column pass.
; Clobbers: mm0-mm7, eax, flags.
;---------------------------------------------------------------------------
%macro Z_COND_IDCT 9
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
%define bt %9
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm2,[src1] ; R3 R1 r3 r1
movq mm3,[src5] ; R7 R5 r7 r5
; all-zero test over the whole group; branch out if nothing to do
movq mm4,mm0
por mm4,mm1
por mm4,mm2
por mm4,mm3
packssdw mm4,mm4
movd eax,mm4
or eax,eax
jz near bt
; full row transform (identical to the non-shortcut path of DC_COND_IDCT)
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
movq mm7,[coeffs+48] ; C3 C1 C3 C1
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
rounder_op mm0, rounder_arg
paddd mm1,mm0 ; A1 a1
paddd mm0,mm0
psubd mm0,mm1 ; A2 a2
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm5 ; B0 b0
movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
paddd mm5,mm2 ; B1 b1
psrad mm7,shift
psrad mm4,shift
movq mm2,mm1 ; A1 a1
paddd mm1,mm5 ; A1+B1 a1+b1
psubd mm2,mm5 ; A1-B1 a1-b1
psrad mm1,shift
psrad mm2,shift
packssdw mm7,mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
movq [ dst ],mm7
movq mm1,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
movq [ dst + 24 ],mm2
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
movq mm7,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm0 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm4,mm7 ; B2 b2
paddd mm2,mm4 ; A2+B2 a2+b2
psubd mm0,mm4 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm0,shift
movq mm4,mm6 ; A3 a3
paddd mm3,mm1 ; B3 b3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm4,mm3 ; a3-B3 a3-b3
psrad mm6,shift
packssdw mm2,mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
movq [ dst + 8],mm2
psrad mm4,shift
packssdw mm4,mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
movq [dst + 16],mm4
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%undef bt
%endmacro
281 |
|
|
282 |
|
|
283 |
|
|
284 |
|
;---------------------------------------------------------------------------
; IDCT0 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column pass using all four source qwords (no row group was zero).
; Results are saturated to 16 bits and stored with movd at
; dst + {0,16,32,48,64,80,96,112} — one store per output row, 16 bytes
; apart.  rounder_op/rounder_arg are accepted but unused here (rounding
; was folded into the row pass; the commented-out lines mark where it
; would apply).  Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT0 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm2,[src1] ; R3 R1 r3 r1
movq mm3,[src5] ; R7 R5 r7 r5
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+48] ; C3 C1 C3 C1
; rounder_op mm0, rounder_arg
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
paddd mm0,mm1 ; A1 a1
psubd mm5,mm1 ; A2 a2
movq mm1,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm1 ; B0 b0
movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
paddd mm1,mm2 ; B1 b1
psrad mm7,shift
psrad mm4,shift
movq mm2,mm0 ; A1 a1
paddd mm0,mm1 ; A1+B1 a1+b1
psubd mm2,mm1 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm2,shift
; rows 0,1 and their mirrors 6,7
packssdw mm7,mm7 ; A0+B0 a0+b0
movd [ dst ],mm7
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [ dst + 16],mm0
packssdw mm2,mm2 ; A1-B1 a1-b1
movd [ dst + 96 ],mm2
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [ dst + 112],mm4
movq mm0,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
movq mm7,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm5 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm4,mm7 ; B2 b2
paddd mm2,mm4 ; A2+B2 a2+b2
psubd mm5,mm4 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm5,shift
movq mm4,mm6 ; A3 a3
paddd mm3,mm0 ; B3 b3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm4,mm3 ; a3-B3 a3-b3
psrad mm6,shift
psrad mm4,shift
; rows 2,3 and their mirrors 4,5
packssdw mm2,mm2 ; A2+B2 a2+b2
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [ dst + 32 ],mm2
packssdw mm4,mm4 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [ dst + 48 ],mm6
movd [ dst + 64 ],mm4
movd [ dst + 80 ],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
376 |
|
|
377 |
|
|
378 |
|
|
379 |
|
;---------------------------------------------------------------------------
; IDCT4 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where the src1 qword is known zero
; (it is never loaded): every term derived from src1 is dropped from the
; odd part.  Stores movd results at dst + {0,16,32,48,64,80,96,112}.
; rounder_op/rounder_arg are unused (rounding folded into the row pass).
; Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT4 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm3,[src5] ; R7 R5 r7 r5
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
; rounder_op mm0, rounder_arg
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
paddd mm0,mm1 ; A1 a1
psubd mm5,mm1 ; A2 a2
; odd part uses only src5 (src1 contributions are zero)
movq mm1,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm1,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm1 ; A0-B0 a0-b0
psrad mm1,shift
psrad mm4,shift
movq mm2,mm0 ; A1 a1
paddd mm0,mm7 ; A1+B1 a1+b1
psubd mm2,mm7 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm2,shift
packssdw mm1,mm1 ; A0+B0 a0+b0
movd [ dst ],mm1
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [ dst + 16 ],mm0
packssdw mm2,mm2 ; A1-B1 a1-b1
movd [ dst + 96 ],mm2
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [ dst + 112 ],mm4
movq mm1,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm5 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm2,mm1 ; A2+B2 a2+b2
psubd mm5,mm1 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm5,shift
movq mm1,mm6 ; A3 a3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm1,mm3 ; a3-B3 a3-b3
psrad mm6,shift
psrad mm1,shift
packssdw mm2,mm2 ; A2+B2 a2+b2
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 32],mm2
packssdw mm1,mm1 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 48],mm6
movd [dst + 64],mm1
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
459 |
|
|
460 |
|
|
461 |
|
|
462 |
|
;---------------------------------------------------------------------------
; IDCT6 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where both src4 and src1 are known
; zero (neither is loaded): the even part reduces to the src0 butterfly
; and the odd part to the src5 terms.  Stores movd results at
; dst + {0,16,32,48,64,80,96,112}.  rounder_op/rounder_arg unused.
; Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT6 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm3,[src5] ; R7 R5 r7 r5
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
; rounder_op mm0, rounder_arg
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
movq mm1,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm1,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm1 ; A0-B0 a0-b0
psrad mm1,shift
psrad mm4,shift
movq mm2,mm0 ; A1 a1
paddd mm0,mm7 ; A1+B1 a1+b1
psubd mm2,mm7 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm2,shift
packssdw mm1,mm1 ; A0+B0 a0+b0
movd [ dst ],mm1
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [ dst + 16 ],mm0
packssdw mm2,mm2 ; A1-B1 a1-b1
movd [ dst + 96 ],mm2
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [ dst + 112 ],mm4
movq mm1,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm5 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm2,mm1 ; A2+B2 a2+b2
psubd mm5,mm1 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm5,shift
movq mm1,mm6 ; A3 a3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm1,mm3 ; a3-B3 a3-b3
psrad mm6,shift
psrad mm1,shift
packssdw mm2,mm2 ; A2+B2 a2+b2
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 32],mm2
packssdw mm1,mm1 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 48],mm6
movd [dst + 64],mm1
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
533 |
|
|
534 |
|
|
535 |
|
|
536 |
|
|
537 |
|
;---------------------------------------------------------------------------
; IDCT2 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where src4 is known zero (it is never
; loaded): the even part reduces to the src0 butterfly, the odd part
; still uses src1 and src5.  Stores movd results at
; dst + {0,16,32,48,64,80,96,112}.  rounder_op/rounder_arg unused.
; Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT2 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm2,[src1] ; R3 R1 r3 r1
movq mm3,[src5] ; R7 R5 r7 r5
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+48] ; C3 C1 C3 C1
; rounder_op mm0, rounder_arg
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
movq mm1,[coeffs+56] ; C7 C5 C7 C5
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm1 ; B0 b0
movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
paddd mm1,mm2 ; B1 b1
psrad mm7,shift
psrad mm4,shift
movq mm2,mm0 ; A1 a1
paddd mm0,mm1 ; A1+B1 a1+b1
psubd mm2,mm1 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm2,shift
packssdw mm7,mm7 ; A0+B0 a0+b0
movd [dst],mm7
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [dst + 16],mm0
packssdw mm2,mm2 ; A1-B1 a1-b1
movd [dst + 96],mm2
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [dst + 112],mm4
movq mm0,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
movq mm7,[coeffs+88] ; C3 C7 C3 C7
pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
movq mm2,mm5 ; A2 a2
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
paddd mm4,mm7 ; B2 b2
paddd mm2,mm4 ; A2+B2 a2+b2
psubd mm5,mm4 ; a2-B2 a2-b2
psrad mm2,shift
psrad mm5,shift
movq mm4,mm6 ; A3 a3
paddd mm3,mm0 ; B3 b3
paddd mm6,mm3 ; A3+B3 a3+b3
psubd mm4,mm3 ; a3-B3 a3-b3
psrad mm6,shift
psrad mm4,shift
packssdw mm2,mm2 ; A2+B2 a2+b2
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 32],mm2
packssdw mm4,mm4 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 48],mm6
movd [dst + 64],mm4
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
620 |
|
|
621 |
|
|
622 |
|
|
623 |
|
;---------------------------------------------------------------------------
; IDCT3 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where src4 and src5 are known zero
; (neither is loaded): the even part reduces to the src0 butterfly and
; the odd part to the src1 terms.  Stores movd results at
; dst + {0,16,32,48,64,80,96,112}.  rounder_op/rounder_arg unused.
; Clobbers mm0-mm7 except mm3 (mm3 is used as the coeffs+64 scratch).
;---------------------------------------------------------------------------
%macro IDCT3 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm2,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+48] ; C3 C1 C3 C1
; rounder_op mm0, rounder_arg
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
movq mm3,[coeffs+64]
pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
psrad mm7,shift
psrad mm4,shift
movq mm1,mm0 ; A1 a1
paddd mm0,mm3 ; A1+B1 a1+b1
psubd mm1,mm3 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm1,shift
packssdw mm7,mm7 ; A0+B0 a0+b0
movd [dst],mm7
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [dst + 16],mm0
packssdw mm1,mm1 ; A1-B1 a1-b1
movd [dst + 96],mm1
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [dst + 112],mm4
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
movq mm1,mm5 ; A2 a2
paddd mm1,mm4 ; A2+B2 a2+b2
psubd mm5,mm4 ; a2-B2 a2-b2
psrad mm1,shift
psrad mm5,shift
movq mm4,mm6 ; A3 a3
paddd mm6,mm2 ; A3+B3 a3+b3
psubd mm4,mm2 ; a3-B3 a3-b3
psrad mm6,shift
psrad mm4,shift
packssdw mm1,mm1 ; A2+B2 a2+b2
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 32],mm1
packssdw mm4,mm4 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 48],mm6
movd [dst + 64],mm4
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
694 |
|
|
695 |
|
|
696 |
|
|
697 |
|
;---------------------------------------------------------------------------
; IDCT5 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where src1 and src5 are known zero
; (neither is loaded): the odd part (B terms) vanishes entirely, so the
; output rows are mirror images (row n == row 7-n) and each result is
; stored twice.  Processes TWO column pairs per invocation (src0/src4
; and src0+8/src4+8), which is why the caller only instantiates it for
; esp and esp+16, with the esp+8/esp+24 calls commented out.
; Stores movq results at dst + {0,16,32,48,64,80,96,112}.
; rounder_op/rounder_arg unused.  Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT5 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
paddd mm4,mm5 ; A0 a0
; rounder_op mm0, rounder_arg
psubd mm6,mm5 ; A3 a3
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
paddd mm0,mm1 ; A1 a1
psubd mm5,mm1 ; A2 a2
; second column pair (offset +8 in the scratch rows)
movq mm2,[src0 + 8] ; R4 R0 r4 r0
movq mm3,[src4 + 8] ; R6 R2 r6 r2
movq mm1,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
movq mm7,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2
pmaddwd mm3,[coeffs+40] ; -C2R6+C6R2 -C2r6+C6r2
; rounder_op mm1, rounder_arg
paddd mm7,mm1 ; A0 a0
paddd mm1,mm1 ; 2C0 2c0
; rounder_op mm2, rounder_arg
psubd mm1,mm7 ; A3 a3
paddd mm3,mm2 ; A1 a1
paddd mm2,mm2 ; 2C1 2c1
psubd mm2,mm3 ; A2 a2
psrad mm4,shift
psrad mm7,shift
psrad mm3,shift
packssdw mm4,mm7 ; A0 a0
movq [dst],mm4
psrad mm0,shift
packssdw mm0,mm3 ; A1 a1
movq [dst + 16],mm0
; mirrored stores: rows 6/7 equal rows 1/0
movq [dst + 96],mm0
movq [dst + 112],mm4
psrad mm5,shift
psrad mm6,shift
psrad mm2,shift
packssdw mm5,mm2 ; A2-B2 a2-b2
movq [dst + 32],mm5
psrad mm1,shift
packssdw mm6,mm1 ; A3+B3 a3+b3
movq [dst + 48],mm6
; mirrored stores: rows 4/5 equal rows 3/2
movq [dst + 64],mm6
movq [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
770 |
|
|
771 |
|
|
772 |
|
;---------------------------------------------------------------------------
; IDCT1 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where src5 is known zero (it is never
; loaded): the odd part uses only the src1 terms.  Stores movd results at
; dst + {0,16,32,48,64,80,96,112}.  rounder_op/rounder_arg unused.
; Clobbers mm0-mm7.
;---------------------------------------------------------------------------
%macro IDCT1 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm1,[src4] ; R6 R2 r6 r2
movq mm2,[src1] ; R3 R1 r3 r1
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
movq mm5,[coeffs+32] ; C6 C2 C6 C2
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
; rounder_op mm4, rounder_arg
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+48] ; C3 C1 C3 C1
; rounder_op mm0, rounder_arg
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
paddd mm4,mm5 ; A0 a0
psubd mm6,mm5 ; A3 a3
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
paddd mm0,mm1 ; A1 a1
psubd mm5,mm1 ; A2 a2
movq mm1,[coeffs+64]
pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1
paddd mm7,mm4 ; A0+B0 a0+b0
paddd mm4,mm4 ; 2A0 2a0
psubd mm4,mm7 ; A0-B0 a0-b0
psrad mm7,shift
psrad mm4,shift
movq mm3,mm0 ; A1 a1
paddd mm0,mm1 ; A1+B1 a1+b1
psubd mm3,mm1 ; A1-B1 a1-b1
psrad mm0,shift
psrad mm3,shift
packssdw mm7,mm7 ; A0+B0 a0+b0
movd [dst],mm7
packssdw mm0,mm0 ; A1+B1 a1+b1
movd [dst + 16],mm0
packssdw mm3,mm3 ; A1-B1 a1-b1
movd [dst + 96],mm3
packssdw mm4,mm4 ; A0-B0 a0-b0
movd [dst + 112],mm4
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
movq mm3,mm5 ; A2 a2
paddd mm3,mm4 ; A2+B2 a2+b2
psubd mm5,mm4 ; a2-B2 a2-b2
psrad mm3,shift
psrad mm5,shift
movq mm4,mm6 ; A3 a3
paddd mm6,mm2 ; A3+B3 a3+b3
psubd mm4,mm2 ; a3-B3 a3-b3
psrad mm6,shift
packssdw mm3,mm3 ; A2+B2 a2+b2
movd [dst + 32],mm3
psrad mm4,shift
packssdw mm6,mm6 ; A3+B3 a3+b3
movd [dst + 48],mm6
packssdw mm4,mm4 ; A3-B3 a3-b3
packssdw mm5,mm5 ; A2-B2 a2-b2
movd [dst + 64],mm4
movd [dst + 80],mm5
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
852 |
|
|
853 |
|
|
854 |
|
|
855 |
|
|
856 |
|
;---------------------------------------------------------------------------
; IDCT7 src0, src4, src1, src5, dst, rounder_op, rounder_arg, shift
;
; Column-pass variant for the case where only the src0 group is nonzero:
; the output is fully determined by the two even terms A0/A1, so the
; eight output rows are mirrored replicated stores.  Processes TWO
; column pairs per invocation (src0 and src0+8), which is why the caller
; only instantiates it for esp and esp+16, with the esp+8/esp+24 calls
; commented out.  src4/src1/src5 and rounder_op/rounder_arg are unused.
; Clobbers mm0, mm1, mm2, mm4, mm7.
; Fix: removed the original's dead "movq mm7,[coeffs+32]" load — mm7 was
; never read after that point.
;---------------------------------------------------------------------------
%macro IDCT7 8
%define src0 %1
%define src4 %2
%define src1 %3
%define src5 %4
%define dst %5
%define rounder_op %6
%define rounder_arg %7
%define shift %8
movq mm0,[src0] ; R4 R0 r4 r0
movq mm4,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
; rounder_op mm4, rounder_arg
; rounder_op mm0, rounder_arg
psrad mm4,shift
psrad mm0,shift
; second column pair (offset +8 in the scratch rows)
movq mm2,[src0 + 8] ; R4 R0 r4 r0
movq mm1,[coeffs+16] ; C4 C4 C4 C4
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
; rounder_op mm1, rounder_arg
; rounder_op mm2, rounder_arg
psrad mm1,shift
packssdw mm4,mm1 ; A0 a0
movq [dst],mm4
psrad mm2,shift
packssdw mm0,mm2 ; A1 a1
movq [dst + 16],mm0
; remaining rows are copies of A0/A1 (odd part is zero)
movq [dst + 96],mm0
movq [dst + 112],mm4
movq [dst + 32],mm0
movq [dst + 48],mm4
movq [dst + 64],mm4
movq [dst + 80],mm0
%undef src0
%undef src4
%undef src1
%undef src5
%undef dst
%undef rounder_op
%undef rounder_arg
%undef shift
%endmacro
903 |
|
|
904 |
|
|
905 |
|
|
906 |
|
;---------------------------------------------------------------------------
; cglobal name
; Export a symbol, prepending a leading underscore when the target object
; format requires one (i.e. when PREFIX is defined).
;---------------------------------------------------------------------------
%macro cglobal 1
%ifdef PREFIX
%define %1 _%1                  ; all later uses of the name expand to _name
global %1
%else
global %1
%endif
%endmacro
914 |
|
|
915 |
|
|
916 |
|
;---------------------------------------------------------------------------
; void simple_idct_mmx(int16_t * const block);
;
; In-place 8x8 inverse DCT.  cdecl: the block pointer (64 int16 values)
; is read from [esp+128+4] after the 128-byte stack scratch is reserved.
; Structure: a row pass (shift ROW_SHIFT=11) transforms the four 32-byte
; input groups into the scratch; each Z_COND_IDCT's branch target records
; that its group was all zero, dispatching to a column pass (shift
; COL_SHIFT=20) specialised for the surviving groups.  IDCT5/IDCT7
; handle two column pairs per call, hence the commented-out invocations.
; Clobbers: eax, edx, mm0-mm7, flags.
; NOTE(review): no emms is executed here — FPU/MMX state handling is
; left to the caller; confirm callers do this.
;---------------------------------------------------------------------------
align 16
cglobal simple_idct_mmx
simple_idct_mmx
sub esp, 128 ; scratch for the row-pass output
mov edx, [esp+128+4] ; edx = block

; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt

; row pass over the four input groups; [coeffs+8]/[coeffs] are the rounders
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .one
; all groups nonzero: full column pass, one call per column pair
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT0 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.four ; group edx+32..56 was all zero
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT4 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.six ; groups edx+32..56 and edx+64..88 were all zero
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT6 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.two ; group edx+64..88 was all zero
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT2 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.three ; groups edx+64..88 and edx+96..120 were all zero
IDCT3 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT3 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT3 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT3 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.five ; only groups edx+0..24 and edx+64..88 nonzero; IDCT5 does 2 pairs/call
IDCT5 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
; IDCT5 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT5 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
; IDCT5 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.one ; group edx+96..120 was all zero
IDCT1 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
IDCT1 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT1 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
IDCT1 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20
jmp .ret

align 16
.seven ; only group edx+0..24 nonzero; IDCT7 does 2 pairs/call; falls through
IDCT7 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
; IDCT7 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
IDCT7 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
; IDCT7 esp+24, esp+88, esp+56, esp+120,edx+12, nop, 0, 20

.ret
add esp, 128
ret
997 |
|
|
998 |
|
|