58 |
%endif |
%endif |
59 |
%endmacro |
%endmacro |
60 |
|
|
61 |
plus_one times 4 dw 1 |
align 16 |
62 |
|
|
63 |
|
plus_one times 8 dw 1 |
64 |
|
|
65 |
;=========================================================================== |
;=========================================================================== |
66 |
; |
; |
72 |
times 4 dw %1 / 2 |
times 4 dw %1 / 2 |
73 |
%endmacro |
%endmacro |
74 |
|
|
75 |
align ALIGN |
align 16 |
76 |
mmx_sub |
mmx_sub |
77 |
MMX_SUB 1 |
MMX_SUB 1 |
78 |
MMX_SUB 2 |
MMX_SUB 2 |
122 |
times 4 dw (1 << 16) / (%1 * 2) + 1 |
times 4 dw (1 << 16) / (%1 * 2) + 1 |
123 |
%endmacro |
%endmacro |
124 |
|
|
125 |
align ALIGN |
align 16 |
126 |
mmx_div |
mmx_div |
127 |
MMX_DIV 1 |
MMX_DIV 1 |
128 |
MMX_DIV 2 |
MMX_DIV 2 |
172 |
%endif |
%endif |
173 |
%endmacro |
%endmacro |
174 |
|
|
175 |
align ALIGN |
align 16 |
176 |
mmx_add |
mmx_add |
177 |
MMX_ADD 1 |
MMX_ADD 1 |
178 |
MMX_ADD 2 |
MMX_ADD 2 |
217 |
times 4 dw %1 * 2 |
times 4 dw %1 * 2 |
218 |
%endmacro |
%endmacro |
219 |
|
|
220 |
align ALIGN |
align 16 |
221 |
mmx_mul |
mmx_mul |
222 |
MMX_MUL 1 |
MMX_MUL 1 |
223 |
MMX_MUL 2 |
MMX_MUL 2 |
262 |
mmx_32768_minus_2048 times 4 dw (32768-2048) |
mmx_32768_minus_2048 times 4 dw (32768-2048) |
263 |
mmx_32767_minus_2047 times 4 dw (32767-2047) |
mmx_32767_minus_2047 times 4 dw (32767-2047) |
264 |
|
|
265 |
|
align 16 |
266 |
|
sse2_pos_2047 times 8 dw 2047 |
267 |
|
sse2_neg_2048 times 8 dw -2048 |
268 |
|
|
269 |
|
|
270 |
section .text |
section .text |
271 |
|
|
482 |
|
|
;===========================================================================
;
; uint32_t quant_inter_sse2(int16_t * coeff,
;                           const int16_t * data,
;                           const uint32_t quant);
;
; Quantises 64 int16 values from 'data' into 'coeff' (32-bit cdecl, all
; arguments on the stack).  For every input x:
;
;     level = max(|x| - mmx_sub[quant], 0) / (2 * quant)
;     coeff = sign(x) * level
;
; mmx_sub[quant] holds quant/2 and mmx_div[quant] holds
; (1 << 16) / (2 * quant) + 1, so the division is a pmulhw (multiply,
; keep the high word).  quant == 1 is special-cased: its divider entry
; would be 32769, which does not fit the signed 16-bit multiplier of
; pmulhw, so a logical shift right by one is used instead.
;
; Returns (eax) the sum of all 64 'level' values.
;
; Both loops use movdqa, so 'coeff' and 'data' must be 16-byte aligned.
; Saves/restores esi and edi; clobbers eax, ecx, xmm0-xmm7, mm0, mm5.
; NOTE(review): MMX registers are written and no emms is executed here;
; presumably the caller takes care of that -- confirm.
;
;===========================================================================

align 16
cglobal quant_inter_sse2
quant_inter_sse2

    push    esi
    push    edi

    mov     edi, [esp + 8 + 4]          ; coeff  (8 = the two pushes above)
    mov     esi, [esp + 8 + 8]          ; data
    mov     eax, [esp + 8 + 12]         ; quant

    xor     ecx, ecx                    ; ecx = offset into the block, in 8-byte units

    pxor    xmm5, xmm5                  ; xmm5 = running sum (8 word lanes)

    movq    mm0, [mmx_sub + eax*8 - 8]  ; table entries are 8 bytes, 1-based on quant
    movq2dq xmm6, mm0                   ; xmm6 low  half = sub
    movlhps xmm6, xmm6                  ; xmm6 high half = sub

    cmp     al, 1
    jz      near .q1loop                ; quant == 1: shift instead of multiply

    movq    mm0, [mmx_div + eax*8 - 8]  ; divider
    movq2dq xmm7, mm0
    movlhps xmm7, xmm7                  ; xmm7 = divider in all 8 lanes

align 16
.loop
    movdqa  xmm0, [esi + ecx*8]         ; xmm0 = 8 input words
    movdqa  xmm3, [esi + ecx*8 + 16]    ; xmm3 = next 8 input words
    pxor    xmm1, xmm1
    pxor    xmm4, xmm4
    pcmpgtw xmm1, xmm0                  ; xmm1 = sign mask (0 > x ? 0xffff : 0)
    pcmpgtw xmm4, xmm3
    pxor    xmm0, xmm1                  ; (x ^ mask) - mask  ==  |x|
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4
    psubusw xmm0, xmm6                  ; |x| -= sub, saturating at 0
    psubusw xmm3, xmm6
    pmulhw  xmm0, xmm7                  ; level = (|x| * div) >> 16
    pmulhw  xmm3, xmm7
    paddw   xmm5, xmm0                  ; sum += level (still unsigned here)
    pxor    xmm0, xmm1                  ; re-apply the original sign
    paddw   xmm5, xmm3
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4
    movdqa  [edi + ecx*8], xmm0
    movdqa  [edi + ecx*8 + 16], xmm3

    add     ecx, 4                      ; 4 * 8 = 32 bytes = 16 coefficients
    cmp     ecx, 16                     ; 16 * 8 = 128 bytes = 64 coefficients
    jnz     .loop

.done
    ; Horizontal add of the 8 word lanes in xmm5 into eax.
    pmaddwd xmm5, [plus_one]            ; pairs of words -> 4 dwords (plus_one = 8 x 1, 16-byte aligned)
    movhlps xmm6, xmm5                  ; bring the upper two dwords down
    paddd   xmm5, xmm6                  ; low two dwords now hold partial sums
    movdq2q mm0, xmm5

    movq    mm5, mm0
    psrlq   mm5, 32
    paddd   mm0, mm5                    ; low dword = total
    movd    eax, mm0                    ; return sum

    pop     edi
    pop     esi

    ret

align 16
.q1loop
    ; quant == 1.  Same dataflow and registers as .loop (xmm6 = sub,
    ; xmm5 = sum) so that .done sees the accumulated sum; only the
    ; divide step differs.
    movdqa  xmm0, [esi + ecx*8]         ; xmm0 = 8 input words
    movdqa  xmm3, [esi + ecx*8 + 16]    ; xmm3 = next 8 input words
    pxor    xmm1, xmm1
    pxor    xmm4, xmm4
    pcmpgtw xmm1, xmm0                  ; sign masks
    pcmpgtw xmm4, xmm3
    pxor    xmm0, xmm1                  ; |x|
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4
    psubusw xmm0, xmm6                  ; |x| -= sub, saturating at 0
    psubusw xmm3, xmm6
    psrlw   xmm0, 1                     ; level = |x| >> 1  (divide by 2 * quant, quant == 1)
    psrlw   xmm3, 1
    paddw   xmm5, xmm0                  ; sum += level
    pxor    xmm0, xmm1                  ; re-apply the original sign
    paddw   xmm5, xmm3
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4
    movdqa  [edi + ecx*8], xmm0
    movdqa  [edi + ecx*8 + 16], xmm3

    add     ecx, 4
    cmp     ecx, 16
    jnz     .q1loop

    jmp     .done
594 |
|
|
595 |
|
|
596 |
|
|
597 |
|
;=========================================================================== |
598 |
|
; |
599 |
; void dequant_intra_mmx(int16_t *data, |
; void dequant_intra_mmx(int16_t *data, |
600 |
; const int16_t const *coeff, |
; const int16_t const *coeff, |
601 |
; const uint32_t quant, |
; const uint32_t quant, |
770 |
pop esi |
pop esi |
771 |
|
|
772 |
ret |
ret |
773 |
|
|
774 |
|
|
;===========================================================================
;
; void dequant_inter_sse2(int16_t * data,
;                         const int16_t * const coeff,
;                         const uint32_t quant);
;
; Inverse quantisation of 64 int16 coefficients (32-bit cdecl, all
; arguments on the stack).  For every coefficient c, using 16-bit
; wrapping arithmetic:
;
;     c == 0 :  data = 0
;     c != 0 :  data = sign(c) * (|c| * mmx_mul[quant] + mmx_add[quant])
;
; mmx_mul[quant] holds 2 * quant.  When SATURATE is defined at assembly
; time the result is additionally clamped to [-2048, 2047].
;
; movdqa is used throughout, so 'data' and 'coeff' must be 16-byte
; aligned.  Saves/restores esi and edi; clobbers eax, xmm0-xmm7, mm6, mm7.
; NOTE(review): MMX registers are written and no emms is executed here;
; presumably the caller takes care of that -- confirm.
;
;===========================================================================

align 16
cglobal dequant_inter_sse2
dequant_inter_sse2

    push    esi
    push    edi

    ; Arguments sit above the return address and the two saved registers.
    mov     eax, [esp + 8 + 12]             ; quant
    mov     esi, [esp + 8 + 8]              ; coeff (source)
    mov     edi, [esp + 8 + 4]              ; data  (destination)

    ; Fetch the 4-word table entries for this quant (8 bytes each,
    ; 1-based) and broadcast each one to all 8 lanes of an xmm register.
    movq    mm7, [mmx_mul + eax * 8 - 8]
    movq    mm6, [mmx_add + eax * 8 - 8]

    movq2dq xmm7, mm7
    movlhps xmm7, xmm7                      ; xmm7 = multiplier x 8

    movq2dq xmm6, mm6
    movlhps xmm6, xmm6                      ; xmm6 = addend x 8

    xor     eax, eax                        ; eax = block offset, in 8-byte units

align 16
.next_pair
    ; Two vectors (16 coefficients) are processed per iteration:
    ;   xmm0 / xmm3 = values, xmm1 / xmm4 = sign masks, xmm2 / xmm5 = addends
    movdqa  xmm0, [esi + eax*8]
    movdqa  xmm3, [esi + eax*8 + 16]

    ; addend = (c == 0) ? 0 : add
    pxor    xmm2, xmm2
    pxor    xmm5, xmm5
    pcmpeqw xmm2, xmm0                      ; 0xffff where c == 0
    pcmpeqw xmm5, xmm3
    pandn   xmm2, xmm6                      ; ~mask & add
    pandn   xmm5, xmm6

    ; sign mask = (0 > c) ? 0xffff : 0
    pxor    xmm1, xmm1
    pxor    xmm4, xmm4
    pcmpgtw xmm1, xmm0
    pcmpgtw xmm4, xmm3

    ; |c| = (c ^ mask) - mask
    pxor    xmm0, xmm1
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4

    ; |c| * mul + addend   (low 16 bits kept)
    pmullw  xmm0, xmm7
    pmullw  xmm3, xmm7
    paddw   xmm0, xmm2
    paddw   xmm3, xmm5

    ; put the original sign back
    pxor    xmm0, xmm1
    pxor    xmm3, xmm4
    psubw   xmm0, xmm1
    psubw   xmm3, xmm4

%ifdef SATURATE
    ; xmm2 / xmm4 are free again at this point and are reused for the bounds.
    movdqa  xmm2, [sse2_pos_2047]
    movdqa  xmm4, [sse2_neg_2048]
    pminsw  xmm0, xmm2                      ; clamp to <=  2047
    pminsw  xmm3, xmm2
    pmaxsw  xmm0, xmm4                      ; clamp to >= -2048
    pmaxsw  xmm3, xmm4
%endif

    movdqa  [edi + eax*8], xmm0
    movdqa  [edi + eax*8 + 16], xmm3

    add     eax, 4                          ; 4 * 8 = 32 bytes = 16 coefficients
    cmp     eax, 16                         ; 16 * 8 = 128 bytes = 64 coefficients
    jnz     near .next_pair

    pop     edi
    pop     esi

    ret