24 |
|
|
25 |
BITS 32 |
BITS 32 |
26 |
|
|
27 |
;=============================================================================
; Symbol export helper
;-----------------------------------------------------------------------------
; The local cglobal/ENDFUNC macro pair (underscore PREFIX handling for
; Mach-O-style symbol naming, plus ELF ":function %1.endfunc-%1" symbol-size
; marking under MARK_FUNCS) is superseded by the shared nasm.inc header,
; which provides cglobal/ENDFUNC centrally — presumably along with the
; portable register aliases (TMP0/TMP1/_EAX) and argument accessors
; (prm1..prm8) used below; confirm against nasm.inc.
;=============================================================================
%include "nasm.inc"
|
28 |
|
|
29 |
;-----------------------------------------------------------------------------
; ACC_ROW  %1=accumulator (mmx reg), %2=scratch (mmx reg)
;
; Sums two consecutive 8-pixel rows into %1 via PSADBW against mm0
; (the caller lum_8x8_mmx zeroes mm0 first, so |p - 0| reduces to a plain
; pixel sum per row).  TMP0 = current row pointer, TMP1 = stride.
; Advances TMP0 past both rows.  Clobbers: %1, %2, TMP0, flags (paddw).
;-----------------------------------------------------------------------------
%macro ACC_ROW 2
  movq %1, [TMP0]             ; load row 0 (8 bytes)
  movq %2, [TMP0+TMP1]        ; load row 1, one stride below
  psadbw %1, mm0              ; sum of abs differences vs 0 -> row-0 sum
  psadbw %2, mm0              ; row-1 sum
  lea TMP0, [TMP0+2*TMP1]     ; ptr += 2*stride (lea leaves flags intact)
  paddw %1, %2                ; %1 = row0 sum + row1 sum
%endmacro
37 |
|
|
38 |
;-----------------------------------------------------------------------------
; CONSIM_1x8_SSE2 — consumes one 8-pixel row from each frame.
; NOTE(review): this span is a garbled side-by-side diff dump — each source
; line appears as its original line number, the old (ecx/edx) text, then the
; new portable (TMP0/TMP1) text.  Original lines 44-55 (presumably the
; multiply/accumulate body feeding xmm5-7 — confirm) are missing from this
; extraction, so the macro body is incomplete here.
; TMP0 = original-frame row ptr, TMP1 = compressed-frame row ptr; xmm2 is
; zeroed by the caller (consim_sse2) and used to unpack bytes to words.
;-----------------------------------------------------------------------------
%macro CONSIM_1x8_SSE2 0 |
%macro CONSIM_1x8_SSE2 0 |
39 |
movdqu xmm0,[ecx] |
movdqu xmm0,[TMP0] |
40 |
movdqu xmm1,[edx] |
movdqu xmm1,[TMP1] |
41 |
 |
 |
42 |
;unpack to words |
;unpack to words |
43 |
punpcklbw xmm0,xmm2 |
punpcklbw xmm0,xmm2 |
56 |
%endmacro |
%endmacro |
57 |
|
|
58 |
;-----------------------------------------------------------------------------
; CONSIM_1x8_MMX — consumes one 8-pixel row from each frame, in two 4-pixel
; halves (punpcklbw then punpckhbw), accumulating a correlation term into
; mm7 via pmaddwd/paddd.
; NOTE(review): garbled side-by-side diff dump — each source line appears as
; its original line number, the old (ecx/edx) text, then the new portable
; (TMP0/TMP1) text.  Original lines 64-73 and 82-93 are missing from this
; extraction, so the macro body is incomplete here.
; TMP0 = original-frame row ptr ("orig"), TMP1 = compressed-frame row ptr
; ("comp"); mm2 is zeroed by the caller (consim_mmx) for the unpacks.
;-----------------------------------------------------------------------------
%macro CONSIM_1x8_MMX 0 |
%macro CONSIM_1x8_MMX 0 |
59 |
movq mm0,[ecx];orig |
movq mm0,[TMP0];orig |
60 |
movq mm1,[edx];comp |
movq mm1,[TMP1];comp |
61 |
 |
 |
62 |
;unpack low half of qw to words |
;unpack low half of qw to words |
63 |
punpcklbw mm0,mm2 |
punpcklbw mm0,mm2 |
74 |
pmaddwd mm1,mm0 |
pmaddwd mm1,mm0 |
75 |
paddd mm7,mm1 |
paddd mm7,mm1 |
76 |
 |
 |
77 |
movq mm0,[ecx];orig |
movq mm0,[TMP0];orig |
78 |
movq mm1,[edx];comp |
movq mm1,[TMP1];comp |
79 |
 |
 |
80 |
;unpack high half of qw to words |
;unpack high half of qw to words |
81 |
punpckhbw mm0,mm2 |
punpckhbw mm0,mm2 |
94 |
%endmacro |
%endmacro |
95 |
|
|
96 |
;-----------------------------------------------------------------------------
; CONSIM_WRITEOUT  %1=devo sum, %2=devc sum, %3=corr sum (simd regs, dword
;                  result in the low lane)
;
; Finalizes the three accumulated sums by subtracting the mean term
; (rounded: (lum*lum + 32) >> 6) and stores each result through the output
; pointers passed as arguments 6-8.
; In:  prm4 = lumo, prm5 = lumc (luma sums),
;      prm6 = pdevo, prm7 = pdevc, prm8 = pcorr (result pointers).
; Clobbers: _EAX and edx (mul writes edx:eax), TMP0, TMP1, flags.
;-----------------------------------------------------------------------------
%macro CONSIM_WRITEOUT 3
  mov _EAX, prm4              ; lumo
  mul _EAX                    ; lumo^2
  add _EAX, 32                ; rounding bias before the shift
  shr _EAX, 6                 ; 64*lum0^2
  movd TMP0d, %1
  sub TMP0, _EAX              ; devo = sum - mean term

  mov TMP1, prm6              ; pdevo
  mov [TMP1], TMP0

  mov eax, prm5d              ; lumc (32-bit load zero-extends _EAX)
  mul _EAX                    ; lumc^2
  add _EAX, 32
  shr _EAX, 6                 ; 64*lumc^2
  movd TMP0d, %2
  sub TMP0, _EAX              ; devc = sum - mean term

  mov TMP1, prm7              ; pdevc
  mov [TMP1], TMP0

  mov _EAX, prm4              ; lumo
  mul prm5d                   ; lumo*lumc, should fit in _EAX
  add _EAX, 32
  shr _EAX, 6                 ; 64*lumo*lumc
  movd TMP0d, %3
  sub TMP0, _EAX              ; corr = sum - mean term

  mov TMP1, prm8              ; pcorr
  mov [TMP1], TMP0
%endmacro
127 |
|
|
128 |
|
|
129 |
;=============================================================================
; Code
;=============================================================================
; Read-only executable section; alignment constant comes from nasm.inc.
SECTION .rotext align=SECTION_ALIGN

cglobal lum_8x8_mmx
cglobal consim_sse2
; NOTE(review): original line 133 is missing from this extraction; it likely
; declared another entry point (consim_mmx is defined below) — confirm.
134 |
|
|
135 |
;int lum_8x8_c(uint8_t* ptr, uint32_t stride) |
;int lum_8x8_c(uint8_t* ptr, uint32_t stride) |
136 |
|
|
137 |
;-----------------------------------------------------------------------------
; int lum_8x8_c(uint8_t* ptr, uint32_t stride)  — per the C prototype above;
; sums the 8x8 luma block at ptr (mm0 zeroed as the PSADBW reference).
; NOTE(review): garbled side-by-side diff dump — each source line appears as
; its original line number, the old 32-bit text ([esp+N]/ecx/edx), then the
; new portable text (prm1/prm2, TMP0/TMP1 from nasm.inc).  Original lines
; 144-156 (the summing body, presumably ACC_ROW calls and the result move
; into eax — confirm) are missing from this extraction.
;-----------------------------------------------------------------------------
ALIGN 16 |
ALIGN SECTION_ALIGN |
138 |
lum_8x8_mmx: |
lum_8x8_mmx: |
139 |
mov ecx, [esp + 4] ;ptr |
mov TMP0, prm1 ;ptr |
140 |
mov edx, [esp + 8];stride |
mov TMP1, prm2 ;stride |
141 |
 |
 |
142 |
pxor mm0,mm0 |
pxor mm0,mm0 |
143 |
 |
 |
157 |
ret |
ret |
158 |
ENDFUNC |
ENDFUNC |
159 |
|
|
160 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
161 |
consim_sse2: |
consim_sse2: |
162 |
mov ecx,[esp+4] ;ptro |
mov TMP0,prm1 ;ptro |
163 |
mov edx,[esp+8] ;ptrc |
mov TMP1,prm2 ;ptrc |
164 |
mov eax,[esp+12];stride |
mov _EAX, prm3 ;stride |
165 |
|
|
166 |
pxor xmm2,xmm2;null vektor |
pxor xmm2,xmm2;null vektor |
167 |
pxor xmm5,xmm5;devo |
pxor xmm5,xmm5;devo |
177 |
pshufd xmm7,xmm7,00000000b |
pshufd xmm7,xmm7,00000000b |
178 |
|
|
179 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
180 |
add ecx,eax |
add TMP0,_EAX |
181 |
add edx,eax |
add TMP1,_EAX |
182 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
183 |
add ecx,eax |
add TMP0,_EAX |
184 |
add edx,eax |
add TMP1,_EAX |
185 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
186 |
add ecx,eax |
add TMP0,_EAX |
187 |
add edx,eax |
add TMP1,_EAX |
188 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
189 |
add ecx,eax |
add TMP0,_EAX |
190 |
add edx,eax |
add TMP1,_EAX |
191 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
192 |
add ecx,eax |
add TMP0,_EAX |
193 |
add edx,eax |
add TMP1,_EAX |
194 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
195 |
add ecx,eax |
add TMP0,_EAX |
196 |
add edx,eax |
add TMP1,_EAX |
197 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
198 |
add ecx,eax |
add TMP0,_EAX |
199 |
add edx,eax |
add TMP1,_EAX |
200 |
CONSIM_1x8_SSE2 |
CONSIM_1x8_SSE2 |
201 |
|
|
202 |
;accumulate xmm5-7 |
;accumulate xmm5-7 |
223 |
|
|
224 |
|
|
225 |
|
|
226 |
ALIGN 16 |
ALIGN SECTION_ALIGN |
227 |
consim_mmx: |
consim_mmx: |
228 |
mov ecx,[esp+4] ;ptro |
mov TMP0,prm1 ;ptro |
229 |
mov edx,[esp+8] ;ptrc |
mov TMP1,prm2 ;ptrc |
230 |
mov eax,[esp+12];stride |
mov _EAX,prm3;stride |
231 |
pxor mm2,mm2;null |
pxor mm2,mm2;null |
232 |
pxor mm5,mm5;devo |
pxor mm5,mm5;devo |
233 |
pxor mm6,mm6;devc |
pxor mm6,mm6;devc |
234 |
pxor mm7,mm7;corr |
pxor mm7,mm7;corr |
235 |
|
|
236 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
237 |
add ecx,eax |
add TMP0,_EAX |
238 |
add edx,eax |
add TMP1,_EAX |
239 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
240 |
add ecx,eax |
add TMP0,_EAX |
241 |
add edx,eax |
add TMP1,_EAX |
242 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
243 |
add ecx,eax |
add TMP0,_EAX |
244 |
add edx,eax |
add TMP1,_EAX |
245 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
246 |
add ecx,eax |
add TMP0,_EAX |
247 |
add edx,eax |
add TMP1,_EAX |
248 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
249 |
add ecx,eax |
add TMP0,_EAX |
250 |
add edx,eax |
add TMP1,_EAX |
251 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
252 |
add ecx,eax |
add TMP0,_EAX |
253 |
add edx,eax |
add TMP1,_EAX |
254 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
255 |
add ecx,eax |
add TMP0,_EAX |
256 |
add edx,eax |
add TMP1,_EAX |
257 |
CONSIM_1x8_MMX |
CONSIM_1x8_MMX |
258 |
|
|
259 |
movq mm0,mm5 |
movq mm0,mm5 |