54 |
; Athlon optimizations contributed by Jaan Kalda |
; Athlon optimizations contributed by Jaan Kalda |
55 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
56 |
|
|
|
BITS 32 |
|
|
|
|
57 |
;============================================================================= |
;============================================================================= |
58 |
; Macros and other preprocessor constants |
; Macros and other preprocessor constants |
59 |
;============================================================================= |
;============================================================================= |
60 |
|
|
61 |
;-----------------------------------------------------------------------------
; cglobal <name>
;   Declares <name> as a global (exported) symbol.
;   With PREFIX defined, the exported symbol is _<name>, and <name> itself is
;   %define'd to _<name> so that later uses of <name> in this file expand to
;   the underscore-prefixed symbol. Without PREFIX, <name> is exported as-is.
;
; NOTE(review): this text looks like a two-column diff rendering. The
;   %include "nasm.inc" line directly below presumably belongs to the other
;   revision (where it replaces this macro), not to the macro body -- confirm.
;-----------------------------------------------------------------------------
%macro cglobal 1 |
%include "nasm.inc" |
|
%ifdef PREFIX |
|
|
; PREFIX build: export the underscore-prefixed name ...
global _%1 |
|
|
; ... and alias the plain name to it.
%define %1 _%1 |
|
|
%else |
|
|
; No PREFIX: export the name unchanged.
global %1 |
|
|
%endif |
|
|
%endmacro |
|
62 |
|
|
63 |
; Accuracy setting from which the row-pass shift count (SHIFT_INV_ROW) is derived.
%define BITS_INV_ACC 5 ; 4 or 5 for IEEE |
%define BITS_INV_ACC 5 ; 4 or 5 for IEEE |
64 |
; Arithmetic right-shift count applied with psrad to the 32-bit row-pass sums.
; The expression is parenthesised so that the macro always expands as a single
; term, even when it is placed next to a higher-precedence operator.
%define SHIFT_INV_ROW (16 - BITS_INV_ACC) |
%define SHIFT_INV_ROW (16 - BITS_INV_ACC) |
76 |
; Local Data (Read Only) |
; Local Data (Read Only) |
77 |
;============================================================================= |
;============================================================================= |
78 |
|
|
79 |
; Open the read-only data section that holds the constants defined below.
; With FORMAT_COFF defined the section is opened without an alignment
; attribute; otherwise 16-byte section alignment is requested.
; NOTE(review): "DATA" below is presumably a macro supplied by nasm.inc and
;   comes from the other revision shown in this diff view -- confirm.
%ifdef FORMAT_COFF |
DATA |
|
SECTION .rodata |
|
|
%else |
|
|
SECTION .rodata align=16 |
|
|
%endif |
|
80 |
|
|
81 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
82 |
; Various memory constants (trigonometric values or rounding values) |
; Various memory constants (trigonometric values or rounding values) |
83 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
84 |
|
|
85 |
; one_corr: four 16-bit words, each holding 1 (8 bytes in total), placed on an
; aligned boundary. It is not referenced in the visible portion of this file.
ALIGN 16 |
ALIGN SECTION_ALIGN |
86 |
one_corr: |
one_corr: |
87 |
dw 1, 1, 1, 1 |
dw 1, 1, 1, 1 |
88 |
round_inv_row: |
round_inv_row: |
199 |
; Code |
; Code |
200 |
;============================================================================= |
;============================================================================= |
201 |
|
|
202 |
SECTION .text |
TEXT |
203 |
|
|
204 |
cglobal idct_3dne |
cglobal idct_3dne |
205 |
|
|
207 |
; void idct_3dne(int16_t block[64]); |
; void idct_3dne(int16_t block[64]); |
208 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
209 |
|
|
210 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
211 |
idct_3dne: |
idct_3dne: |
212 |
mov eax, [esp+4] |
mov _ECX, prm1 |
213 |
|
|
214 |
; DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0 |
; DCT_8_INV_ROW_1_s [_ECX+64], [_ECX+64], tab_i_04_sse, rounder_4 ;rounder_4=0 |
215 |
pshufw mm0, [eax+64],10001000b ; x2 x0 x2 x0 |
pshufw mm0, [_ECX+64],10001000b ; x2 x0 x2 x0 |
216 |
movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 |
217 |
pshufw mm1, [eax+64+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+64+8],10001000b ; x6 x4 x6 x4 |
218 |
movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 |
219 |
pshufw mm2, [eax+64],11011101b ; x3 x1 x3 x1 |
pshufw mm2, [_ECX+64],11011101b ; x3 x1 x3 x1 |
220 |
pshufw mm5, [eax+64+8],11011101b ; x7 x5 x7 x5 |
pshufw mm5, [_ECX+64+8],11011101b ; x7 x5 x7 x5 |
221 |
movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 |
222 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
223 |
movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 ; |
movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 ; |
230 |
pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
231 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
232 |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
233 |
pshufw mm1, [eax+80+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+80+8],10001000b ; x6 x4 x6 x4 |
234 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
235 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
236 |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
237 |
pshufw mm5, [eax+80],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
pshufw mm5, [_ECX+80],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
238 |
movq mm7, mm0 ; 7 ; a3 a2 |
movq mm7, mm0 ; 7 ; a3 a2 |
239 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
240 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
241 |
movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 |
242 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
243 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
244 |
pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+80],11011101b; x3 x1 x3 x1 |
245 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
246 |
pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
247 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
250 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
251 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
252 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
253 |
pshufw mm0, [eax+80+8],11011101b ; x7 x5 x7 x5 |
pshufw mm0, [_ECX+80+8],11011101b ; x7 x5 x7 x5 |
254 |
movq [eax+64], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
movq [_ECX+64], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
255 |
|
|
256 |
; DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5 |
; DCT_8_INV_ROW_1_s [_ECX+80], [_ECX+80], tab_i_35_xmm, rounder_5 |
257 |
movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 |
258 |
movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 |
259 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
260 |
paddd mm3, [rounder_5] ; +rounder stall 6 |
paddd mm3, [rounder_5] ; +rounder stall 6 |
261 |
paddd mm5, [rounder_5] ; +rounder |
paddd mm5, [rounder_5] ; +rounder |
262 |
movq [eax+64+8], mm7 ; 7 ; save y7 y6 y5 y4 |
movq [_ECX+64+8], mm7 ; 7 ; save y7 y6 y5 y4 |
263 |
movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 |
264 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
265 |
pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
269 |
pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
270 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
271 |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
272 |
pshufw mm1, [eax+96+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+96+8],10001000b ; x6 x4 x6 x4 |
273 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
274 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
275 |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
276 |
pshufw mm0, [eax+96],10001000b ; x2 x0 x2 x0 |
pshufw mm0, [_ECX+96],10001000b ; x2 x0 x2 x0 |
277 |
movq mm7, mm5 ; 7 ; a3 a2 |
movq mm7, mm5 ; 7 ; a3 a2 |
278 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
279 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
280 |
movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 |
281 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
282 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
283 |
pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+96],11011101b; x3 x1 x3 x1 |
284 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
285 |
pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
286 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
289 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
290 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
291 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
292 |
pshufw mm5, [eax+96+8],11011101b ; x7 x5 x7 x5 |
pshufw mm5, [_ECX+96+8],11011101b ; x7 x5 x7 x5 |
293 |
movq [eax+80], mm6 ; 3 ; save y3 y2 y1 y0 |
movq [_ECX+80], mm6 ; 3 ; save y3 y2 y1 y0 |
294 |
|
|
295 |
; DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6 |
; DCT_8_INV_ROW_1_s [_ECX+96], [_ECX+96], tab_i_26_xmm, rounder_6 |
296 |
movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 |
297 |
movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 |
298 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
299 |
paddd mm3, [rounder_6] ; +rounder |
paddd mm3, [rounder_6] ; +rounder |
300 |
paddd mm0, [rounder_6] ; +rounder |
paddd mm0, [rounder_6] ; +rounder |
301 |
movq [eax+80+8], mm7 ; 7 ; save y7 y6 |
movq [_ECX+80+8], mm7 ; 7 ; save y7 y6 |
302 |
movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 |
303 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
304 |
pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
308 |
pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
309 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
310 |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
311 |
pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+112+8],10001000b ; x6 x4 x6 x4 |
312 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
313 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
314 |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
315 |
pshufw mm5, [eax+112],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
pshufw mm5, [_ECX+112],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
316 |
movq mm7, mm0 ; 7 ; a3 a2 |
movq mm7, mm0 ; 7 ; a3 a2 |
317 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
318 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
319 |
movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 |
320 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
321 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
322 |
pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+112],11011101b; x3 x1 x3 x1 |
323 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
324 |
pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
325 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
328 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
329 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
330 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
331 |
pshufw mm0, [eax+112+8],11011101b ; x7 x5 x7 x5 |
pshufw mm0, [_ECX+112+8],11011101b ; x7 x5 x7 x5 |
332 |
movq [eax+96], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
movq [_ECX+96], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
333 |
|
|
334 |
; DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7 |
; DCT_8_INV_ROW_1_s [_ECX+112], [_ECX+112], tab_i_17_xmm, rounder_7 |
335 |
movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 |
336 |
movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 |
337 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
338 |
paddd mm3, [rounder_7] ; +rounder stall 6 |
paddd mm3, [rounder_7] ; +rounder stall 6 |
339 |
paddd mm5, [rounder_7] ; +rounder |
paddd mm5, [rounder_7] ; +rounder |
340 |
movq [eax+96+8], mm7 ; 7 ; save y7 y6 y5 y4 |
movq [_ECX+96+8], mm7 ; 7 ; save y7 y6 y5 y4 |
341 |
movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 |
342 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
343 |
pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
347 |
pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
348 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
349 |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
350 |
pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4 |
pshufw mm1, [_ECX+0+8],10001000b; x6 x4 x6 x4 |
351 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
352 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
353 |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
354 |
pshufw mm0, [eax+0],10001000b ; x2 x0 x2 x0 |
pshufw mm0, [_ECX+0],10001000b ; x2 x0 x2 x0 |
355 |
movq mm7, mm5 ; 7 ; a3 a2 |
movq mm7, mm5 ; 7 ; a3 a2 |
356 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
357 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
358 |
movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 |
359 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
360 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
361 |
pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1 |
pshufw mm2, [_ECX+0],11011101b ; x3 x1 x3 x1 |
362 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
363 |
pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
364 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
367 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
368 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
369 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
370 |
pshufw mm5, [eax+0+8],11011101b; x7 x5 x7 x5 |
pshufw mm5, [_ECX+0+8],11011101b; x7 x5 x7 x5 |
371 |
movq [eax+112], mm6 ; 3 ; save y3 y2 y1 y0 |
movq [_ECX+112], mm6 ; 3 ; save y3 y2 y1 y0 |
372 |
|
|
373 |
; DCT_8_INV_ROW_1_s [eax+0], 0, tab_i_04_xmm, rounder_0 |
; DCT_8_INV_ROW_1_s [_ECX+0], 0, tab_i_04_xmm, rounder_0 |
374 |
movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 |
375 |
movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 |
376 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
377 |
paddd mm3, [rounder_0] ; +rounder |
paddd mm3, [rounder_0] ; +rounder |
378 |
paddd mm0, [rounder_0] ; +rounder |
paddd mm0, [rounder_0] ; +rounder |
379 |
movq [eax+112+8], mm7 ; 7 ; save y7 y6 |
movq [_ECX+112+8], mm7 ; 7 ; save y7 y6 |
380 |
movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 |
381 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
382 |
pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
386 |
pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
387 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
388 |
paddd mm0, mm1 ; 1 |
paddd mm0, mm1 ; 1 |
389 |
pshufw mm1, [eax+16+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+16+8],10001000b ; x6 x4 x6 x4 |
390 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
391 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
392 |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
393 |
pshufw mm5, [eax+16],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
pshufw mm5, [_ECX+16],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
394 |
movq mm7, mm0 ; 7 ; a3 a2 |
movq mm7, mm0 ; 7 ; a3 a2 |
395 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
396 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
397 |
movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 |
398 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
399 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
400 |
pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+16],11011101b; x3 x1 x3 x1 |
401 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
402 |
pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
403 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
406 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
407 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
408 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
409 |
pshufw mm0, [eax+16+8],11011101b ; x7 x5 x7 x5 |
pshufw mm0, [_ECX+16+8],11011101b ; x7 x5 x7 x5 |
410 |
movq [eax+0], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
movq [_ECX+0], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
411 |
|
|
412 |
; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1 |
; DCT_8_INV_ROW_1_s [_ECX+16], 16, tab_i_17_xmm, rounder_1 |
413 |
movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 |
414 |
movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 |
415 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
416 |
paddd mm3, [rounder_1] ; +rounder stall 6 |
paddd mm3, [rounder_1] ; +rounder stall 6 |
417 |
paddd mm5, [rounder_1] ; +rounder |
paddd mm5, [rounder_1] ; +rounder |
418 |
movq [eax+0+8], mm7 ; 7 ; save y7 y6 y5 y4 |
movq [_ECX+0+8], mm7 ; 7 ; save y7 y6 y5 y4 |
419 |
movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 |
420 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
421 |
pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
425 |
pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
426 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
427 |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
428 |
pshufw mm1, [eax+32+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+32+8],10001000b ; x6 x4 x6 x4 |
429 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
430 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
431 |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
432 |
pshufw mm0, [eax+32],10001000b; x2 x0 x2 x0 |
pshufw mm0, [_ECX+32],10001000b; x2 x0 x2 x0 |
433 |
movq mm7, mm5 ; 7 ; a3 a2 |
movq mm7, mm5 ; 7 ; a3 a2 |
434 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 |
435 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
436 |
movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 |
437 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
438 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm5, mm2 ; 0 free a3+b3 a2+b2 |
439 |
pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+32],11011101b; x3 x1 x3 x1 |
440 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 |
441 |
pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
442 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
445 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm5, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
446 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
447 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm5 ; 0 free ; y3 y2 y1 y0 |
448 |
pshufw mm5, [eax+32+8],11011101b ; x7 x5 x7 x5 |
pshufw mm5, [_ECX+32+8],11011101b ; x7 x5 x7 x5 |
449 |
movq [eax+16], mm6 ; 3 ; save y3 y2 y1 y0 |
movq [_ECX+16], mm6 ; 3 ; save y3 y2 y1 y0 |
450 |
|
|
451 |
; DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2 |
; DCT_8_INV_ROW_1_s [_ECX+32], 32, tab_i_26_xmm, rounder_2 |
452 |
movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 |
453 |
movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 |
454 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 |
455 |
paddd mm3, [rounder_2] ; +rounder |
paddd mm3, [rounder_2] ; +rounder |
456 |
paddd mm0, [rounder_2] ; +rounder |
paddd mm0, [rounder_2] ; +rounder |
457 |
movq [eax+16+8], mm7 ; 7 ; save y7 y6 |
movq [_ECX+16+8], mm7 ; 7 ; save y7 y6 |
458 |
movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 |
459 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
460 |
pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
464 |
pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 |
465 |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) |
466 |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) |
467 |
pshufw mm1, [eax+48+8],10001000b ; x6 x4 x6 x4 |
pshufw mm1, [_ECX+48+8],10001000b ; x6 x4 x6 x4 |
468 |
movq mm4, mm3 ; 4 ; a1 a0 |
movq mm4, mm3 ; 4 ; a1 a0 |
469 |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) |
470 |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
paddd mm2, mm5 ; 5 free ; b3=sum(odd3) b2=sum(odd2) |
471 |
pshufw mm5, [eax+48],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
pshufw mm5, [_ECX+48],10001000b; x2 x0 x2 x0 mm5 & mm0 exchanged for next cycle |
472 |
movq mm7, mm0 ; 7 ; a3 a2 |
movq mm7, mm0 ; 7 ; a3 a2 |
473 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 |
474 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
475 |
movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 |
movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 |
476 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
477 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
paddd mm0, mm2 ; 0 free a3+b3 a2+b2 |
478 |
pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1 |
pshufw mm2, [_ECX+48],11011101b; x3 x1 x3 x1 |
479 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 |
480 |
pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 |
481 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
484 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
485 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
486 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
packssdw mm6, mm0 ; 0 free ; y3 y2 y1 y0 |
487 |
pshufw mm0, [eax+48+8],11011101b ; x7 x5 x7 x5 |
pshufw mm0, [_ECX+48+8],11011101b ; x7 x5 x7 x5 |
488 |
movq [eax+32], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
movq [_ECX+32], mm6 ; 3 ; save y3 y2 y1 y0 stall2 |
489 |
|
|
490 |
; DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3 |
; DCT_8_INV_ROW_1_s [_ECX+48], [_ECX+48], tab_i_35_xmm, rounder_3 |
491 |
movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 |
movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 |
492 |
movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 |
movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 |
493 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 |
494 |
paddd mm3, [rounder_3] ; +rounder stall 6 |
paddd mm3, [rounder_3] ; +rounder stall 6 |
495 |
paddd mm5, [rounder_3] ; +rounder |
paddd mm5, [rounder_3] ; +rounder |
496 |
movq [eax+32+8], mm7 ; 7 ; save y7 y6 y5 y4 |
movq [_ECX+32+8], mm7 ; 7 ; save y7 y6 y5 y4 |
497 |
movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 |
movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 |
498 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 |
499 |
pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 |
513 |
paddd mm3, mm6 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
paddd mm3, mm6 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 |
514 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
psubd mm7, mm2 ; ; a3-b3 a2-b2 |
515 |
paddd mm2, mm5 ; 0 free a3+b3 a2+b2 |
paddd mm2, mm5 ; 0 free a3+b3 a2+b2 |
516 |
movq mm5, [eax+16*5] |
movq mm5, [_ECX+16*5] |
517 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 |
518 |
psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 |
psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 |
519 |
psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 |
psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 |
520 |
psrad mm2, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
psrad mm2, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 |
521 |
movq mm6, [eax+16*1] |
movq mm6, [_ECX+16*1] |
522 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 |
523 |
movq mm4, [tg_1_16] |
movq mm4, [tg_1_16] |
524 |
packssdw mm3, mm2 ; 0 free ; y3 y2 y1 y0 |
packssdw mm3, mm2 ; 0 free ; y3 y2 y1 y0 |
525 |
pshufw mm2, mm7, 10110001b ; y7 y6 y5 y4 |
pshufw mm2, mm7, 10110001b ; y7 y6 y5 y4 |
526 |
|
|
527 |
; DCT_8_INV_COL_4 [eax+0],[eax+0] |
; DCT_8_INV_COL_4 [_ECX+0],[_ECX+0] |
528 |
; movq mm3,mmword ptr [eax+16*3] |
; movq mm3,mmword ptr [_ECX+16*3] |
529 |
movq mm7, [eax+16*7] |
movq mm7, [_ECX+16*7] |
530 |
pmulhw mm0, mm3 ; x3*(tg_3_16-1) |
pmulhw mm0, mm3 ; x3*(tg_3_16-1) |
531 |
pmulhw mm1, mm5 ; x5*(tg_3_16-1) |
pmulhw mm1, mm5 ; x5*(tg_3_16-1) |
532 |
movq [eax+48+8], mm2 ; 7 ; save y7 y6 y5 y4 |
movq [_ECX+48+8], mm2 ; 7 ; save y7 y6 y5 y4 |
533 |
movq mm2, mm4 ; tg_1_16 |
movq mm2, mm4 ; tg_1_16 |
534 |
pmulhw mm4, mm7 ; x7*tg_1_16 |
pmulhw mm4, mm7 ; x7*tg_1_16 |
535 |
paddsw mm0, mm3 ; x3*tg_3_16 |
paddsw mm0, mm3 ; x3*tg_3_16 |
536 |
pmulhw mm2, mm6 ; x1*tg_1_16 |
pmulhw mm2, mm6 ; x1*tg_1_16 |
537 |
paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) |
paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) |
538 |
psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 |
psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 |
539 |
movq [eax+48], mm3 ; 3 ; save y3 y2 y1 y0 |
movq [_ECX+48], mm3 ; 3 ; save y3 y2 y1 y0 |
540 |
movq mm3, [ocos_4_16] |
movq mm3, [ocos_4_16] |
541 |
paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 |
paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 |
542 |
paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 |
paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 |
549 |
paddsw mm2, mm0 ; tm17+tm35 = t2 |
paddsw mm2, mm0 ; tm17+tm35 = t2 |
550 |
movq mm7, [tg_2_16] |
movq mm7, [tg_2_16] |
551 |
movq mm1, mm4 ; t1 |
movq mm1, mm4 ; t1 |
552 |
movq [eax+3*16], mm5 ; save b0 |
movq [_ECX+3*16], mm5 ; save b0 |
553 |
paddsw mm1, mm2 ; t1+t2 |
paddsw mm1, mm2 ; t1+t2 |
554 |
movq [eax+5*16], mm6 ; save b3 |
movq [_ECX+5*16], mm6 ; save b3 |
555 |
psubsw mm4, mm2 ; t1-t2 |
psubsw mm4, mm2 ; t1-t2 |
556 |
movq mm5, [eax+2*16] |
movq mm5, [_ECX+2*16] |
557 |
movq mm0, mm7 ; tg_2_16 |
movq mm0, mm7 ; tg_2_16 |
558 |
movq mm6, [eax+6*16] |
movq mm6, [_ECX+6*16] |
559 |
pmulhw mm0, mm5 ; x2*tg_2_16 |
pmulhw mm0, mm5 ; x2*tg_2_16 |
560 |
pmulhw mm7, mm6 ; x6*tg_2_16 |
pmulhw mm7, mm6 ; x6*tg_2_16 |
561 |
; slot |
; slot |
562 |
pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 |
pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 |
563 |
; slot |
; slot |
564 |
movq mm2, [eax+0*16] |
movq mm2, [_ECX+0*16] |
565 |
pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 |
pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 |
566 |
psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 |
psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 |
567 |
movq mm3, [eax+0*16] ; x0 |
movq mm3, [_ECX+0*16] ; x0 |
568 |
movq mm6, [eax+4*16] |
movq mm6, [_ECX+4*16] |
569 |
paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 |
paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 |
570 |
paddsw mm2, mm6 ; x0+x4 = tp04 |
paddsw mm2, mm6 ; x0+x4 = tp04 |
571 |
psubsw mm3, mm6 ; x0-x4 = tm04 |
psubsw mm3, mm6 ; x0-x4 = tm04 |
585 |
psubsw mm7, mm1 ; a1-b1 |
psubsw mm7, mm1 ; a1-b1 |
586 |
psraw mm6, SHIFT_INV_COL ; dst2 |
psraw mm6, SHIFT_INV_COL ; dst2 |
587 |
psubsw mm0, mm4 ; a2-b2 |
psubsw mm0, mm4 ; a2-b2 |
588 |
movq mm1, [eax+3*16] ; load b0 |
movq mm1, [_ECX+3*16] ; load b0 |
589 |
psraw mm7, SHIFT_INV_COL ; dst6 |
psraw mm7, SHIFT_INV_COL ; dst6 |
590 |
movq mm4, mm5 ; a0 |
movq mm4, mm5 ; a0 |
591 |
psraw mm0, SHIFT_INV_COL ; dst5 |
psraw mm0, SHIFT_INV_COL ; dst5 |
592 |
movq [eax+1*16], mm3 |
movq [_ECX+1*16], mm3 |
593 |
paddsw mm5, mm1 ; a0+b0 |
paddsw mm5, mm1 ; a0+b0 |
594 |
movq [eax+2*16], mm6 |
movq [_ECX+2*16], mm6 |
595 |
psubsw mm4, mm1 ; a0-b0 |
psubsw mm4, mm1 ; a0-b0 |
596 |
movq mm3, [eax+5*16] ; load b3 |
movq mm3, [_ECX+5*16] ; load b3 |
597 |
psraw mm5, SHIFT_INV_COL ; dst0 |
psraw mm5, SHIFT_INV_COL ; dst0 |
598 |
movq mm6, mm2 ; a3 |
movq mm6, mm2 ; a3 |
599 |
psraw mm4, SHIFT_INV_COL ; dst7 |
psraw mm4, SHIFT_INV_COL ; dst7 |
600 |
movq [eax+5*16], mm0 |
movq [_ECX+5*16], mm0 |
601 |
movq mm0, [tg_3_16] |
movq mm0, [tg_3_16] |
602 |
paddsw mm2, mm3 ; a3+b3 |
paddsw mm2, mm3 ; a3+b3 |
603 |
movq [eax+6*16], mm7 |
movq [_ECX+6*16], mm7 |
604 |
psubsw mm6, mm3 ; a3-b3 |
psubsw mm6, mm3 ; a3-b3 |
605 |
movq mm3, [eax+8+16*3] |
movq mm3, [_ECX+8+16*3] |
606 |
movq [eax+0*16], mm5 |
movq [_ECX+0*16], mm5 |
607 |
psraw mm2, SHIFT_INV_COL ; dst3 |
psraw mm2, SHIFT_INV_COL ; dst3 |
608 |
movq [eax+7*16], mm4 |
movq [_ECX+7*16], mm4 |
609 |
|
|
610 |
; DCT_8_INV_COL_4 [eax+8],[eax+8] |
; DCT_8_INV_COL_4 [_ECX+8],[_ECX+8] |
611 |
movq mm1, mm0 ; tg_3_16 |
movq mm1, mm0 ; tg_3_16 |
612 |
movq mm5, [eax+8+16*5] |
movq mm5, [_ECX+8+16*5] |
613 |
psraw mm6, SHIFT_INV_COL ; dst4 |
psraw mm6, SHIFT_INV_COL ; dst4 |
614 |
pmulhw mm0, mm3 ; x3*(tg_3_16-1) |
pmulhw mm0, mm3 ; x3*(tg_3_16-1) |
615 |
movq mm4, [tg_1_16] |
movq mm4, [tg_1_16] |
616 |
pmulhw mm1, mm5 ; x5*(tg_3_16-1) |
pmulhw mm1, mm5 ; x5*(tg_3_16-1) |
617 |
movq mm7, [eax+8+16*7] |
movq mm7, [_ECX+8+16*7] |
618 |
movq [eax+3*16], mm2 |
movq [_ECX+3*16], mm2 |
619 |
movq mm2, mm4 ; tg_1_16 |
movq mm2, mm4 ; tg_1_16 |
620 |
movq [eax+4*16], mm6 |
movq [_ECX+4*16], mm6 |
621 |
movq mm6, [eax+8+16*1] |
movq mm6, [_ECX+8+16*1] |
622 |
pmulhw mm4, mm7 ; x7*tg_1_16 |
pmulhw mm4, mm7 ; x7*tg_1_16 |
623 |
paddsw mm0, mm3 ; x3*tg_3_16 |
paddsw mm0, mm3 ; x3*tg_3_16 |
624 |
pmulhw mm2, mm6 ; x1*tg_1_16 |
pmulhw mm2, mm6 ; x1*tg_1_16 |
636 |
movq mm7, [tg_2_16] |
movq mm7, [tg_2_16] |
637 |
movq mm1, mm4 ; t1 |
movq mm1, mm4 ; t1 |
638 |
psubsw mm6, mm0 ; tm17-tm35 = b3 |
psubsw mm6, mm0 ; tm17-tm35 = b3 |
639 |
movq [eax+8+3*16], mm5 ; save b0 |
movq [_ECX+8+3*16], mm5 ; save b0 |
640 |
movq [eax+8+5*16], mm6 ; save b3 |
movq [_ECX+8+5*16], mm6 ; save b3 |
641 |
psubsw mm4, mm2 ; t1-t2 |
psubsw mm4, mm2 ; t1-t2 |
642 |
movq mm5, [eax+8+2*16] |
movq mm5, [_ECX+8+2*16] |
643 |
movq mm0, mm7 ; tg_2_16 |
movq mm0, mm7 ; tg_2_16 |
644 |
movq mm6, [eax+8+6*16] |
movq mm6, [_ECX+8+6*16] |
645 |
paddsw mm1, mm2 ; t1+t2 |
paddsw mm1, mm2 ; t1+t2 |
646 |
pmulhw mm0, mm5 ; x2*tg_2_16 |
pmulhw mm0, mm5 ; x2*tg_2_16 |
647 |
pmulhw mm7, mm6 ; x6*tg_2_16 |
pmulhw mm7, mm6 ; x6*tg_2_16 |
648 |
movq mm2, [eax+8+0*16] |
movq mm2, [_ECX+8+0*16] |
649 |
pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 |
pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 |
650 |
psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 |
psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 |
651 |
; slot |
; slot |
652 |
pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 |
pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 |
653 |
; slot |
; slot |
654 |
movq mm3, [eax+8+0*16] ; x0 |
movq mm3, [_ECX+8+0*16] ; x0 |
655 |
movq mm6, [eax+8+4*16] |
movq mm6, [_ECX+8+4*16] |
656 |
paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 |
paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 |
657 |
paddsw mm2, mm6 ; x0+x4 = tp04 |
paddsw mm2, mm6 ; x0+x4 = tp04 |
658 |
psubsw mm3, mm6 ; x0-x4 = tm04 |
psubsw mm3, mm6 ; x0-x4 = tm04 |
672 |
psubsw mm7, mm1 ; a1-b1 |
psubsw mm7, mm1 ; a1-b1 |
673 |
psraw mm6, SHIFT_INV_COL ; dst2 |
psraw mm6, SHIFT_INV_COL ; dst2 |
674 |
psubsw mm0, mm4 ; a2-b2 |
psubsw mm0, mm4 ; a2-b2 |
675 |
movq mm1, [eax+8+3*16] ; load b0 |
movq mm1, [_ECX+8+3*16] ; load b0 |
676 |
psraw mm7, SHIFT_INV_COL ; dst6 |
psraw mm7, SHIFT_INV_COL ; dst6 |
677 |
movq mm4, mm5 ; a0 |
movq mm4, mm5 ; a0 |
678 |
psraw mm0, SHIFT_INV_COL ; dst5 |
psraw mm0, SHIFT_INV_COL ; dst5 |
679 |
movq [eax+8+1*16], mm3 |
movq [_ECX+8+1*16], mm3 |
680 |
paddsw mm5, mm1 ; a0+b0 |
paddsw mm5, mm1 ; a0+b0 |
681 |
movq [eax+8+2*16], mm6 |
movq [_ECX+8+2*16], mm6 |
682 |
psubsw mm4, mm1 ; a0-b0 |
psubsw mm4, mm1 ; a0-b0 |
683 |
movq mm3, [eax+8+5*16] ; load b3 |
movq mm3, [_ECX+8+5*16] ; load b3 |
684 |
psraw mm5, SHIFT_INV_COL ; dst0 |
psraw mm5, SHIFT_INV_COL ; dst0 |
685 |
movq mm6, mm2 ; a3 |
movq mm6, mm2 ; a3 |
686 |
psraw mm4, SHIFT_INV_COL ; dst7 |
psraw mm4, SHIFT_INV_COL ; dst7 |
687 |
movq [eax+8+5*16], mm0 |
movq [_ECX+8+5*16], mm0 |
688 |
paddsw mm2, mm3 ; a3+b3 |
paddsw mm2, mm3 ; a3+b3 |
689 |
movq [eax+8+6*16], mm7 |
movq [_ECX+8+6*16], mm7 |
690 |
psubsw mm6, mm3 ; a3-b3 |
psubsw mm6, mm3 ; a3-b3 |
691 |
movq [eax+8+0*16], mm5 |
movq [_ECX+8+0*16], mm5 |
692 |
psraw mm2, SHIFT_INV_COL ; dst3 |
psraw mm2, SHIFT_INV_COL ; dst3 |
693 |
movq [eax+8+7*16], mm4 |
movq [_ECX+8+7*16], mm4 |
694 |
psraw mm6, SHIFT_INV_COL ; dst4 |
psraw mm6, SHIFT_INV_COL ; dst4 |
695 |
movq [eax+8+3*16], mm2 |
movq [_ECX+8+3*16], mm2 |
696 |
movq [eax+8+4*16], mm6 |
movq [_ECX+8+4*16], mm6 |
697 |
|
|
698 |
ret |
ret |
699 |
|
ENDFUNC |
700 |
|
|
701 |
|
|
702 |
|
; When assembling for the ELF output format only, declare an (empty)
; ".note.GNU-stack" section flagged noalloc/noexec/nowrite.  By GNU toolchain
; convention the presence of this section tells the linker that this object
; does not require an executable stack.  Nothing is emitted into the section.
%ifidn __OUTPUT_FORMAT__,elf |
703 |
|
section ".note.GNU-stack" noalloc noexec nowrite progbits |
704 |
|
%endif |
705 |
|
|