74 |
; |
; |
75 |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
; * Some more details at: http://skal.planet-d.net/coding/dct.html |
76 |
; |
; |
|
; |
|
|
;////////////////////////////////////////////////////////////////////// |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000 [0.001] |
|
|
; 0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035 [0.032] |
|
|
; 0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025 [0.027] |
|
|
; 0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031 [0.030] |
|
|
; 0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001 [0.001] |
|
|
; 0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023 [0.023] |
|
|
; 0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027 [0.027] |
|
|
; 0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019 [0.020] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; 0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003 [0.002] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000 [0.000] |
|
|
; 0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000 [0.003] |
|
|
; 0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001 [-0.000] |
|
|
; 0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000 [-0.000] |
|
|
; 0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000 [-0.000] |
|
|
; 0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001 [-0.000] |
|
|
; |
|
|
; ========================= |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0365 |
|
|
; Overall MSE: 0.0201 |
|
|
; Peak ME: 0.0265 |
|
|
; Overall ME: 0.0006 |
|
|
; |
|
77 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
78 |
; |
; |
79 |
; -=IDCT=- |
; -=IDCT=- |
82 |
; descaling) require some unpairable shifting and packing, all on |
; descaling) require some unpairable shifting and packing, all on |
83 |
; the same CPU unit. |
; the same CPU unit. |
84 |
; |
; |
|
; THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300] |
|
|
; INPUT RANGE TEST (because of overflow). But the [-256,255] one |
|
|
; is OK, and I'm fine with it (for now;) |
|
|
; |
|
|
; == Mean square errors == |
|
|
; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006] |
|
|
; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007] |
|
|
; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008] |
|
|
; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007] |
|
|
; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006] |
|
|
; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008] |
|
|
; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008] |
|
|
; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006] |
|
|
; |
|
|
; == Abs Mean errors == |
|
|
; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000] |
|
|
; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000] |
|
|
; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001] |
|
|
; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000] |
|
|
; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000] |
|
|
; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001] |
|
|
; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000] |
|
|
; |
|
|
; ========================= |
|
|
; |
|
|
; Peak error: 1.0000 |
|
|
; Peak MSE: 0.0096 |
|
|
; Overall MSE: 0.0070 |
|
|
; Peak ME: 0.0024 |
|
|
; Overall ME: 0.0001 |
|
|
; |
|
85 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
86 |
|
|
87 |
;============================================================================= |
;============================================================================= |
145 |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
dw 0x3b21, 0x14c3, 0x979e, 0xc4df |
146 |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
dw 0x14c3, 0x587e, 0x587e, 0x979e |
147 |
|
|
|
; the original rounding trick is by |
|
|
; Michel Lespinasse (hi Walken!) <walken@zoy.org> |
|
|
|
|
148 |
; NOTE(review): this span looks like a side-by-side diff rendering rather than
; assemblable source: bare numbers are line-number cells, a trailing '|' closes
; each cell, and consecutive rows show two revisions of the same table.
;
; Row rounders: one row of four identical dwords for each of the 8 IDCT rows.
;  - one revision labels every row individually (Idct_Rnd0 .. Idct_Rnd7);
;  - the other keeps a single base label and is addressed by the row passes
;    in this file as Walken_Idct_Rounders + 16*row.
ALIGN 16 |
ALIGN 16 |
149 |
Idct_Rnd0: dd 65535, 65535, 65535, 65535 |
Walken_Idct_Rounders: |
150 |
Idct_Rnd1: dd 3612, 3612, 3612, 3612 |
dd 65536, 65536, 65536, 65536 |
151 |
Idct_Rnd2: dd 2271, 2271, 2271, 2271 |
dd 3597, 3597, 3597, 3597 |
152 |
Idct_Rnd3: dd 1203, 1203, 1203, 1203 |
dd 2260, 2260, 2260, 2260 |
153 |
Idct_Rnd4: dd 1023, 1023, 1023, 1023 |
dd 1203, 1203, 1203, 1203 |
154 |
Idct_Rnd5: dd 102, 102, 102, 102 |
dd 0, 0, 0, 0 |
155 |
Idct_Rnd6: dd 398, 398, 398, 398 |
dd 120, 120, 120, 120 |
156 |
Idct_Rnd7: dd 469, 469, 469, 469 |
dd 512, 512, 512, 512 |
157 |
|
dd 512, 512, 512, 512 |
158 |
; Sparse rounders: the row rounder shifted right by 11 (the same shift count
; that is passed as the last iMTX_MULT argument), stored as words.
;  - 'times 4 dw' rows are 8 bytes each and are read with movq;
;  - 'times 8 dw' rows are 16 bytes each and are read with movdqa at
;    Walken_Idct_Rounders + 16*8 + 16*row, i.e. directly after the 8 dword rows.
; Resulting word values: row 0 -> 31 (65535>>11) or 32 (65536>>11),
; rows 1 and 2 -> 1. Every remaining rounder is below 2048, so its shifted
; value is 0, which is why only three sparse rows are stored.
Idct_Sparse_Rnd0: times 4 dw (65535>>11) |
|
159 |
Idct_Sparse_Rnd1: times 4 dw ( 3612>>11) |
times 8 dw (65536>>11) |
160 |
Idct_Sparse_Rnd2: times 4 dw ( 2271>>11) |
times 8 dw ( 3597>>11) |
161 |
|
times 8 dw ( 2260>>11) |
162 |
; other rounders are zero... |
; other rounders are zero... |
163 |
|
|
164 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
220 |
SECTION .text |
SECTION .text |
221 |
|
|
222 |
cglobal idct_sse2_skal |
cglobal idct_sse2_skal |
|
cglobal idct_sse2_sparse_skal |
|
223 |
cglobal fdct_sse2_skal |
cglobal fdct_sse2_skal |
224 |
|
|
225 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
230 |
|
|
231 |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
movdqa xmm0, [ecx+%1*16] ; xmm0 = [01234567] |
232 |
|
|
233 |
pshuflw xmm0, xmm0, 11011000b ; [0213] |
pshuflw xmm0, xmm0, 11011000b ; [02134567] ; these two shufflings could be |
234 |
pshufhw xmm0, xmm0, 11011000b ; [02134657] |
pshufhw xmm0, xmm0, 11011000b ; [02134657] ; integrated in zig-zag orders |
235 |
|
|
236 |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
pshufd xmm4, xmm0, 00000000b ; [02020202] |
237 |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
pshufd xmm5, xmm0, 10101010b ; [46464646] |
238 |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
pshufd xmm6, xmm0, 01010101b ; [13131313] |
254 |
psrad xmm4, %4 ; => out [7654] |
psrad xmm4, %4 ; => out [7654] |
255 |
|
|
256 |
packssdw xmm6, xmm4 ; [01237654] |
packssdw xmm6, xmm4 ; [01237654] |
257 |
|
|
258 |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
pshufhw xmm6, xmm6, 00011011b ; [01234567] |
259 |
|
|
260 |
movdqa [ecx+%1*16], xmm6 |
movdqa [ecx+%1*16], xmm6 |
327 |
movdqa xmm3, [%1+0*16] ; x0 |
movdqa xmm3, [%1+0*16] ; x0 |
328 |
movdqa xmm6, [%1+4*16] ; x4 |
movdqa xmm6, [%1+4*16] ; x4 |
329 |
|
|
330 |
|
movdqa [%1 ], xmm2 ; we spill 1 reg to perform safe butterflies |
331 |
|
|
332 |
|
movdqa xmm2, xmm3 |
333 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
psubsw xmm3, xmm6 ; x0-x4 = tm04 |
334 |
paddsw xmm6, xmm6 ; 2.x4 |
paddsw xmm6, xmm2 ; x0+x4 = tp04 |
|
paddsw xmm6, xmm3 ; x0+x4 = tp04 |
|
335 |
|
|
336 |
psubsw xmm3, xmm5 ; tm04-tm26 = a2 |
movdqa xmm2, xmm6 |
337 |
psubsw xmm6, xmm7 ; tp04-tp26 = a3 |
psubsw xmm6, xmm7 |
338 |
paddsw xmm5, xmm5 ; 2.tm26 |
paddsw xmm7, xmm2 |
339 |
paddsw xmm7, xmm7 ; 2.tp26 |
movdqa xmm2, xmm3 |
340 |
paddsw xmm5, xmm3 ; tm04+tm26 = a1 |
psubsw xmm3, xmm5 |
341 |
paddsw xmm7, xmm6 ; tp04+tp26 = a0 |
paddsw xmm5, xmm2 |
342 |
|
|
343 |
psubsw xmm5, xmm0 ; a1-b1 |
movdqa xmm2, xmm5 |
344 |
psubsw xmm3, xmm4 ; a2-b2 |
psubsw xmm5, xmm0 |
345 |
paddsw xmm0, xmm0 ; 2.b1 |
paddsw xmm0, xmm2 |
346 |
paddsw xmm4, xmm4 ; 2.b2 |
movdqa xmm2, xmm3 |
347 |
paddsw xmm0, xmm5 ; a1+b1 |
psubsw xmm3, xmm4 |
348 |
paddsw xmm4, xmm3 ; a2+b2 |
paddsw xmm4, xmm2 |
349 |
|
|
350 |
|
movdqa xmm2, [%1] |
351 |
|
|
352 |
psraw xmm5, 6 ; out6 |
psraw xmm5, 6 ; out6 |
353 |
psraw xmm3, 6 ; out5 |
psraw xmm3, 6 ; out5 |
373 |
psraw xmm2, 6 ; out3 |
psraw xmm2, 6 ; out3 |
374 |
psraw xmm6, 6 ; out4 |
psraw xmm6, 6 ; out4 |
375 |
|
|
376 |
|
; store result |
377 |
|
|
378 |
movdqa [%1+0*16], xmm1 |
movdqa [%1+0*16], xmm1 |
379 |
movdqa [%1+3*16], xmm2 |
movdqa [%1+3*16], xmm2 |
380 |
movdqa [%1+4*16], xmm6 |
movdqa [%1+4*16], xmm6 |
381 |
movdqa [%1+7*16], xmm7 |
movdqa [%1+7*16], xmm7 |
|
%endmacro |
|
382 |
|
|
383 |
;----------------------------------------------------------------------------- |
%endmacro |
|
; Function idct (the straightforward version) |
|
|
;----------------------------------------------------------------------------- |
|
|
|
|
|
; idct_sse2_skal -- IDCT over an 8-row block, performed in place.
;
; In:    [esp+4] = pointer to the block (the null-row-skipping variant in this
;                  file comments the same argument as "Src").
;                  The block is addressed as 8 rows of 16 bytes (ecx + row*16).
;                  The visible macro code accesses the rows with movdqa, so the
;                  pointer must be 16-byte aligned.
; Out:   the block is overwritten with the result.
; Clobb: ecx; the visible macro fragments also write xmm0-xmm7.
;        NOTE(review): the macro bodies are only partly visible here -- confirm
;        the full clobber list against the complete iMTX_MULT / iLLM_PASS.
;
; Every row is processed unconditionally (no null-row test), first by eight
; iMTX_MULT row passes and then by a single iLLM_PASS over the whole block.
ALIGN 16 |
|
|
idct_sse2_skal: |
|
|
; ecx = block pointer, used as the base address by every macro below
mov ecx, [esp+4] |
|
|
; iMTX_MULT row, table, rounder, shift
;   row   : selects the 16-byte row at ecx + row*16 (loaded and stored in place)
;   shift : count of the final psrad before the results are packed to words
;   table / rounder : presumably the coefficient table and the rounding
;                     constant for that row -- their use is not visible here.
; The tables are paired symmetrically: rows 0/4 use iTab1, 1/7 iTab2,
; 2/6 iTab3 and 3/5 iTab4.
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |
|
|
iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |
|
|
iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |
|
|
iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |
|
|
iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |
|
|
iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |
|
|
iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |
|
|
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |
|
|
; second pass over all 8 rows at once; the visible part of the macro shifts
; its outputs right by 6 (psraw) and stores them back to the same rows
iLLM_PASS ecx+0 |
|
|
ret |
|
|
.endfunc |
|
384 |
|
|
385 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
386 |
; Helper macro TEST_ROW (test a null row) |
; Helper macro TEST_ROW (test a null row) |
398 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
399 |
; Function idct (this one skips null rows) |
; Function idct (this one skips null rows) |
400 |
;----------------------------------------------------------------------------- |
;----------------------------------------------------------------------------- |
401 |
|
; IEEE1180 and Walken compatible version |
402 |
|
|
403 |
; IDCT variant that tests each row before transforming it (see the header
; comment above: "this one skips null rows").
;
; NOTE(review): this span looks like a side-by-side diff rendering. Bare numbers
; are line-number cells, a trailing '|' closes each cell, and two revisions
; of the routine are interleaved row by row:
;   - one is named idct_sse2_sparse_skal, uses the Idct_RndN / Idct_Sparse_RndN
;     labels and fills skipped rows through mm0 with two 8-byte stores;
;   - the other is named idct_sse2_skal, indexes Walken_Idct_Rounders and fills
;     skipped rows through xmm0 with one 16-byte store.
;
; In:    [esp+4] = Src, pointer to 8 rows of 16 bytes, transformed in place.
;                  movdqa is used on the rows, so Src must be 16-byte aligned.
; Flow per row:   TEST_ROW <row address>, <label>
;                 presumably branches to <label> when the row is all zero (the
;                 macro body is not visible here -- confirm); otherwise
;                 iMTX_MULT transforms the row.
;   rows 0..2 : the branch target stores the row's sparse rounder into the row.
;   rows 3..7 : the branch target is simply the next row (or .End); nothing is
;               stored, matching the "other rounders are zero" table comment.
; NOTE(review): the mm0 revision writes MMX state and no emms is visible in
; this routine -- confirm whether the caller or a macro takes care of it.
ALIGN 16 |
ALIGN 16 |
404 |
idct_sse2_sparse_skal: |
idct_sse2_skal: |
405 |
|
|
406 |
mov ecx, [esp+ 4] ; Src |
mov ecx, [esp+ 4] ; Src |
407 |
|
|
408 |
; ---- row 0 ----
TEST_ROW ecx, .Row0_Round |
TEST_ROW ecx, .Row0_Round |
409 |
iMTX_MULT 0, iTab1, Idct_Rnd0, 11 |
iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 |
410 |
jmp .Row1 |
jmp .Row1 |
411 |
.Row0_Round |
.Row0_Round |
412 |
; skipped row: fill all 16 bytes of the row with the pre-shifted rounder
movq mm0, [Idct_Sparse_Rnd0] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] |
413 |
movq [ecx ], mm0 |
movdqa [ecx ], xmm0 |
|
movq [ecx+8], mm0 |
|
414 |
|
|
415 |
; ---- row 1 ----
.Row1 |
.Row1 |
416 |
TEST_ROW ecx+16, .Row1_Round |
TEST_ROW ecx+16, .Row1_Round |
417 |
iMTX_MULT 1, iTab2, Idct_Rnd1, 11 |
iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 |
418 |
jmp .Row2 |
jmp .Row2 |
419 |
.Row1_Round |
.Row1_Round |
420 |
movq mm0, [Idct_Sparse_Rnd1] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] |
421 |
movq [ecx+16 ], mm0 |
movdqa [ecx+16 ], xmm0 |
|
movq [ecx+16+8], mm0 |
|
422 |
|
|
423 |
; ---- row 2 ----
.Row2 |
.Row2 |
424 |
TEST_ROW ecx+32, .Row2_Round |
TEST_ROW ecx+32, .Row2_Round |
425 |
iMTX_MULT 2, iTab3, Idct_Rnd2, 11 |
iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 |
426 |
jmp .Row3 |
jmp .Row3 |
427 |
.Row2_Round |
.Row2_Round |
428 |
movq mm0, [Idct_Sparse_Rnd2] |
movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] |
429 |
movq [ecx+32 ], mm0 |
movdqa [ecx+32 ], xmm0 |
|
movq [ecx+32+8], mm0 |
|
430 |
|
|
431 |
; ---- rows 3..7: a skipped row is left untouched ----
.Row3 |
.Row3 |
432 |
TEST_ROW ecx+48, .Row4 |
TEST_ROW ecx+48, .Row4 |
433 |
iMTX_MULT 3, iTab4, Idct_Rnd3, 11 |
iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 |
|
; NOTE(review): this jmp targets the label that immediately follows it
jmp .Row4 |
|
434 |
|
|
435 |
.Row4 |
.Row4 |
436 |
TEST_ROW ecx+64, .Row5 |
TEST_ROW ecx+64, .Row5 |
437 |
iMTX_MULT 4, iTab1, Idct_Rnd4, 11 |
iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 |
|
; NOTE(review): this jmp targets the label that immediately follows it
jmp .Row5 |
|
438 |
|
|
439 |
.Row5 |
.Row5 |
440 |
TEST_ROW ecx+80, .Row6 |
TEST_ROW ecx+80, .Row6 |
441 |
iMTX_MULT 5, iTab4, Idct_Rnd5, 11 |
iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 |
442 |
|
|
443 |
.Row6 |
.Row6 |
444 |
TEST_ROW ecx+96, .Row7 |
TEST_ROW ecx+96, .Row7 |
445 |
iMTX_MULT 6, iTab3, Idct_Rnd6, 11 |
iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 |
446 |
|
|
447 |
.Row7 |
.Row7 |
448 |
TEST_ROW ecx+112, .End |
TEST_ROW ecx+112, .End |
449 |
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 |
iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 |
450 |
.End |
.End |
451 |
|
|
452 |
; second pass, always executed, over the whole block starting at ecx
iLLM_PASS ecx+0 |
iLLM_PASS ecx |
453 |
|
|
454 |
ret |
ret |
455 |
.endfunc |
.endfunc |
456 |
|
|